├── README.md └── src └── main ├── java └── org │ └── apache │ └── spark │ └── graphx │ ├── TripletFields.java │ └── impl │ └── EdgeActiveness.java └── scala └── org └── apache └── spark └── graphx ├── Edge.scala ├── EdgeContext.scala ├── EdgeDirection.scala ├── EdgeRDD.scala ├── EdgeTriplet.scala ├── Graph.scala ├── GraphKryoRegistrator.scala ├── GraphLoader.scala ├── GraphOps.scala ├── GraphXUtils.scala ├── PartitionStrategy.scala ├── Pregel.scala ├── VertexRDD.scala ├── impl ├── EdgePartition.scala ├── EdgePartitionBuilder.scala ├── EdgeRDDImpl.scala ├── GraphImpl.scala ├── ReplicatedVertexView.scala ├── RoutingTablePartition.scala ├── ShippableVertexPartition.scala ├── VertexPartition.scala ├── VertexPartitionBase.scala ├── VertexPartitionBaseOps.scala ├── VertexRDDImpl.scala └── package.scala ├── lib ├── ConnectedComponents.scala ├── LabelPropagation.scala ├── PageRank.scala ├── SVDPlusPlus.scala ├── ShortestPaths.scala ├── StronglyConnectedComponents.scala ├── TriangleCount.scala ├── package-info.java └── package.scala ├── package-info.java ├── package.scala └── util ├── BytecodeUtils.scala ├── GraphGenerators.scala ├── collection └── GraphXPrimitiveKeyOpenHashMap.scala ├── package-info.java └── package.scala
/README.md:
--------------------------------------------------------------------------------
# Source code of GraphX, Spark's graph computation engine, with comments translated into Chinese

--------------------------------------------------------------------------------
/src/main/java/org/apache/spark/graphx/TripletFields.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.graphx;

import java.io.Serializable;

/**
 * Represents a subset of the fields of an [[EdgeTriplet]] or [[EdgeContext]]. This allows the
 * system to populate only those fields for efficiency.
 */
public class TripletFields implements Serializable {

  /** Indicates whether the source vertex attribute is included. */
  public final boolean useSrc;

  /** Indicates whether the destination vertex attribute is included. */
  public final boolean useDst;

  /** Indicates whether the edge attribute is included. */
  public final boolean useEdge;

  /** Constructs a default TripletFields in which all fields are included. */
  public TripletFields() {
    this(true, true, true);
  }

  public TripletFields(boolean useSrc, boolean useDst, boolean useEdge) {
    this.useSrc = useSrc;
    this.useDst = useDst;
    this.useEdge = useEdge;
  }

  /**
   * None of the triplet fields are exposed.
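   * <p>
   * A minimal usage sketch (Scala; {@code graph} is a hypothetical {@code Graph[Int, Int]} that is
   * not defined in this file). Passing one of these presets to {@code aggregateMessages} lets
   * GraphX skip shipping vertex attributes the send function never reads; counting in-degrees
   * needs none of them:
   * <pre>
   * // illustrative only: the message "1" reads neither vertex nor edge attributes
   * val inDegrees =
   *   graph.aggregateMessages[Int](ctx => ctx.sendToDst(1), _ + _, TripletFields.None)
   * </pre>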
50 | */ 51 | public static final TripletFields None = new TripletFields(false, false, false); 52 | 53 | /** 54 | * Expose only the edge field and not the source or destination field. 55 | */ 56 | public static final TripletFields EdgeOnly = new TripletFields(false, false, true); 57 | 58 | /** 59 | * Expose the source and edge fields but not the destination field. (Same as Src) 60 | */ 61 | public static final TripletFields Src = new TripletFields(true, false, true); 62 | 63 | /** 64 | * Expose the destination and edge fields but not the source field. (Same as Dst) 65 | */ 66 | public static final TripletFields Dst = new TripletFields(false, true, true); 67 | 68 | /** 69 | * Expose all the fields (source, edge, and destination). 70 | */ 71 | public static final TripletFields All = new TripletFields(true, true, true); 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/apache/spark/graphx/impl/EdgeActiveness.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl; 19 | 20 | /** 21 | * Criteria for filtering edges based on activeness. For internal use only. 22 | */ 23 | public enum EdgeActiveness { 24 | /** Neither the source vertex nor the destination vertex need be active. */ 25 | Neither, 26 | /** The source vertex must be active. */ 27 | SrcOnly, 28 | /** The destination vertex must be active. */ 29 | DstOnly, 30 | /** Both vertices must be active. */ 31 | Both, 32 | /** At least one vertex must be active. */ 33 | Either 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/Edge.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.util.collection.SortDataFormat 21 | 22 | /** 23 | * A single directed edge consisting of a source id, target id, 24 | * and the data associated with the edge. 25 | * 26 | * @tparam ED type of the edge attribute 27 | * 28 | * @param srcId The vertex id of the source vertex 29 | * @param dstId The vertex id of the target vertex 30 | * @param attr The attribute associated with the edge 31 | */ 32 | case class Edge[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED] ( 33 | var srcId: VertexId = 0, 34 | var dstId: VertexId = 0, 35 | var attr: ED = null.asInstanceOf[ED]) 36 | extends Serializable { 37 | 38 | /** 39 | * Given one vertex in the edge return the other vertex. 40 | * 41 | * @param vid the id one of the two vertices on the edge. 42 | * @return the id of the other vertex on the edge. 43 | */ 44 | def otherVertexId(vid: VertexId): VertexId = 45 | if (srcId == vid) dstId else { assert(dstId == vid); srcId } 46 | 47 | /** 48 | * Return the relative direction of the edge to the corresponding 49 | * vertex. 50 | * 51 | * @param vid the id of one of the two vertices in the edge. 52 | * @return the relative direction of the edge to the corresponding 53 | * vertex. 54 | */ 55 | def relativeDirection(vid: VertexId): EdgeDirection = 56 | if (vid == srcId) EdgeDirection.Out else { assert(vid == dstId); EdgeDirection.In } 57 | } 58 | 59 | object Edge { 60 | private[graphx] def lexicographicOrdering[ED] = new Ordering[Edge[ED]] { 61 | override def compare(a: Edge[ED], b: Edge[ED]): Int = { 62 | if (a.srcId == b.srcId) { 63 | if (a.dstId == b.dstId) 0 64 | else if (a.dstId < b.dstId) -1 65 | else 1 66 | } else if (a.srcId < b.srcId) -1 67 | else 1 68 | } 69 | } 70 | 71 | private[graphx] def edgeArraySortDataFormat[ED] = new SortDataFormat[Edge[ED], Array[Edge[ED]]] { 72 | override def getKey(data: Array[Edge[ED]], pos: Int): Edge[ED] = { 73 | data(pos) 74 | } 75 | 76 | override def swap(data: Array[Edge[ED]], pos0: Int, pos1: Int): Unit = { 77 | val tmp = data(pos0) 78 | data(pos0) = data(pos1) 79 | data(pos1) = tmp 80 | } 81 | 82 | override def copyElement( 83 | src: Array[Edge[ED]], srcPos: Int, 84 | dst: Array[Edge[ED]], dstPos: Int) { 85 | dst(dstPos) = src(srcPos) 86 | } 87 | 88 | override def copyRange( 89 | src: Array[Edge[ED]], srcPos: Int, 90 | dst: Array[Edge[ED]], dstPos: Int, length: Int) { 91 | System.arraycopy(src, srcPos, dst, dstPos, length) 92 | } 93 | 94 | override def allocate(length: Int): Array[Edge[ED]] = { 95 | new Array[Edge[ED]](length) 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/EdgeContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * Represents an edge along with its neighboring vertices and allows sending messages along the 22 | * edge. Used in [[Graph#aggregateMessages]]. 23 | */ 24 | abstract class EdgeContext[VD, ED, A] { 25 | /** The vertex id of the edge's source vertex. */ 26 | def srcId: VertexId 27 | /** The vertex id of the edge's destination vertex. */ 28 | def dstId: VertexId 29 | /** The vertex attribute of the edge's source vertex. */ 30 | def srcAttr: VD 31 | /** The vertex attribute of the edge's destination vertex. */ 32 | def dstAttr: VD 33 | /** The attribute associated with the edge. */ 34 | def attr: ED 35 | 36 | /** Sends a message to the source vertex. */ 37 | def sendToSrc(msg: A): Unit 38 | /** Sends a message to the destination vertex. */ 39 | def sendToDst(msg: A): Unit 40 | 41 | /** Converts the edge and vertex properties into an [[EdgeTriplet]] for convenience. */ 42 | def toEdgeTriplet: EdgeTriplet[VD, ED] = { 43 | val et = new EdgeTriplet[VD, ED] 44 | et.srcId = srcId 45 | et.srcAttr = srcAttr 46 | et.dstId = dstId 47 | et.dstAttr = dstAttr 48 | et.attr = attr 49 | et 50 | } 51 | } 52 | 53 | object EdgeContext { 54 | 55 | /** 56 | * Extractor mainly used for Graph#aggregateMessages*. 57 | * Example: 58 | * {{{ 59 | * val messages = graph.aggregateMessages( 60 | * case ctx @ EdgeContext(_, _, _, _, attr) => 61 | * ctx.sendToDst(attr) 62 | * , _ + _) 63 | * }}} 64 | */ 65 | def unapply[VD, ED, A](edge: EdgeContext[VD, ED, A]): Some[(VertexId, VertexId, VD, VD, ED)] = 66 | Some(edge.srcId, edge.dstId, edge.srcAttr, edge.dstAttr, edge.attr) 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/EdgeDirection.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * The direction of a directed edge relative to a vertex. 22 | */ 23 | class EdgeDirection private (private val name: String) extends Serializable { 24 | /** 25 | * Reverse the direction of an edge. An in becomes out, 26 | * out becomes in and both and either remain the same. 
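   *
   * Illustrative sketch (not part of the original comment):
   * {{{
   * EdgeDirection.In.reverse     // EdgeDirection.Out
   * EdgeDirection.Out.reverse    // EdgeDirection.In
   * EdgeDirection.Either.reverse // EdgeDirection.Either
   * EdgeDirection.Both.reverse   // EdgeDirection.Both
   * }}}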
27 | */ 28 | def reverse: EdgeDirection = this match { 29 | case EdgeDirection.In => EdgeDirection.Out 30 | case EdgeDirection.Out => EdgeDirection.In 31 | case EdgeDirection.Either => EdgeDirection.Either 32 | case EdgeDirection.Both => EdgeDirection.Both 33 | } 34 | 35 | override def toString: String = "EdgeDirection." + name 36 | 37 | override def equals(o: Any): Boolean = o match { 38 | case other: EdgeDirection => other.name == name 39 | case _ => false 40 | } 41 | 42 | override def hashCode: Int = name.hashCode 43 | } 44 | 45 | 46 | /** 47 | * A set of [[EdgeDirection]]s. 48 | */ 49 | object EdgeDirection { 50 | /** Edges arriving at a vertex. */ 51 | final val In: EdgeDirection = new EdgeDirection("In") 52 | 53 | /** Edges originating from a vertex. */ 54 | final val Out: EdgeDirection = new EdgeDirection("Out") 55 | 56 | /** Edges originating from *or* arriving at a vertex of interest. */ 57 | final val Either: EdgeDirection = new EdgeDirection("Either") 58 | 59 | /** Edges originating from *and* arriving at a vertex of interest. */ 60 | final val Both: EdgeDirection = new EdgeDirection("Both") 61 | } 62 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/EdgeRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.language.existentials 21 | import scala.reflect.ClassTag 22 | 23 | import org.apache.spark.Dependency 24 | import org.apache.spark.Partition 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.TaskContext 27 | import org.apache.spark.rdd.RDD 28 | import org.apache.spark.storage.StorageLevel 29 | 30 | import org.apache.spark.graphx.impl.EdgePartition 31 | import org.apache.spark.graphx.impl.EdgePartitionBuilder 32 | import org.apache.spark.graphx.impl.EdgeRDDImpl 33 | 34 | /** 35 | * `EdgeRDD[ED, VD]` extends `RDD[Edge[ED]]` by storing the edges in columnar format on each 36 | * partition for performance. It may additionally store the vertex attributes associated with each 37 | * edge to provide the triplet view. Shipping of the vertex attributes is managed by 38 | * `impl.ReplicatedVertexView`. 
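 *
 * A minimal construction sketch (assumes an existing `sc: SparkContext`; not taken from the
 * original sources). An `EdgeRDD` is normally obtained from a `Graph`, or built directly from a
 * plain `RDD[Edge[ED]]`:
 * {{{
 * val raw = sc.parallelize(Seq(Edge(1L, 2L, "follows"), Edge(2L, 3L, "likes")))
 * // the second type parameter (here Int) is only the vertex-attribute type that may be joined in
 * val edgeRdd = EdgeRDD.fromEdges[String, Int](raw)
 * }}}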
 */
abstract class EdgeRDD[ED](
    sc: SparkContext,
    deps: Seq[Dependency[_]]) extends RDD[Edge[ED]](sc, deps) {

  // scalastyle:off structural.type
  private[graphx] def partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])] forSome { type VD }
  // scalastyle:on structural.type

  // Obtain the partitions that make up this EdgeRDD
  override protected def getPartitions: Array[Partition] = partitionsRDD.partitions

  override def compute(part: Partition, context: TaskContext): Iterator[Edge[ED]] = {
    val p = firstParent[(PartitionID, EdgePartition[ED, _])].iterator(part, context)
    if (p.hasNext) {
      p.next()._2.iterator.map(_.copy())
    } else {
      Iterator.empty
    }
  }

  /**
   * Map the values in an edge partitioning preserving the structure but changing the values.
   * (Maps over the edge values.)
   * @tparam ED2 the new edge value type
   * @param f the function from an edge to a new edge value
   * @return a new EdgeRDD containing the new edge values
   */
  def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDD[ED2]

  /**
   * Reverse all the edges in this RDD.
   *
   * @return a new EdgeRDD containing all the edges reversed
   */
  def reverse: EdgeRDD[ED]

  /**
   * Inner joins this EdgeRDD with another EdgeRDD, assuming both are partitioned using the same
   * [[PartitionStrategy]].
   * (Inner join of matching edges, i.e. edges that share the same endpoints.)
   * @param other the EdgeRDD to join with
   * @param f the join function applied to corresponding values of `this` and `other`
   * @return a new EdgeRDD containing only edges that appear in both `this` and `other`,
   *         with values supplied by `f`
   */
  def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDD[ED3]

  /**
   * Changes the target storage level while preserving all other properties of the
   * EdgeRDD. Operations on the returned EdgeRDD will preserve this storage level.
   * (Changes the storage level.)
   * This does not actually trigger a cache; to do this, call
   * [[org.apache.spark.graphx.EdgeRDD#cache]] on the returned EdgeRDD.
   */
  private[graphx] def withTargetStorageLevel(targetStorageLevel: StorageLevel): EdgeRDD[ED]
}

object EdgeRDD {
  /**
   * Creates an EdgeRDD from a set of edges.
   * (Builds an EdgeRDD from an RDD[Edge[ED]].)
   * @tparam ED the edge attribute type
   * @tparam VD the type of the vertex attributes that may be joined with the returned EdgeRDD
   */
  def fromEdges[ED: ClassTag, VD: ClassTag](edges: RDD[Edge[ED]]): EdgeRDDImpl[ED, VD] = {
    val edgePartitions = edges.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[ED, VD]
      iter.foreach { e =>
        builder.add(e.srcId, e.dstId, e.attr)
      }
      Iterator((pid, builder.toEdgePartition))
    }
    EdgeRDD.fromEdgePartitions(edgePartitions)
  }

  /**
   * Creates an EdgeRDD from already-constructed edge partitions.
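   *
   * Internal-flow sketch (illustrative only; it mirrors what `fromEdges` above already does for a
   * plain `edges: RDD[Edge[ED]]`):
   * {{{
   * val parts = edges.mapPartitionsWithIndex { (pid, iter) =>
   *   val builder = new EdgePartitionBuilder[ED, VD]
   *   iter.foreach(e => builder.add(e.srcId, e.dstId, e.attr))
   *   Iterator((pid, builder.toEdgePartition))
   * }
   * val edgeRdd = EdgeRDD.fromEdgePartitions(parts)
   * }}}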
   * (Builds an EdgeRDD from the given edge partitions.)
   * @tparam ED the edge attribute type
   * @tparam VD the type of the vertex attributes that may be joined with the returned EdgeRDD
   */
  private[graphx] def fromEdgePartitions[ED: ClassTag, VD: ClassTag](
      edgePartitions: RDD[(Int, EdgePartition[ED, VD])]): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(edgePartitions)
  }
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.graphx

/**
 * An edge triplet represents an edge along with the vertex attributes of its neighboring vertices.
 *
 * @tparam VD the type of the vertex attribute.
 * @tparam ED the type of the edge attribute
 */
class EdgeTriplet[VD, ED] extends Edge[ED] {
  /**
   * The source vertex attribute
   */
  var srcAttr: VD = _ // nullValue[VD]

  /**
   * The destination vertex attribute
   */
  var dstAttr: VD = _ // nullValue[VD]

  /**
   * Set the edge properties of this triplet.
   * (Note: srcId, dstId and attr are inherited from the superclass.)
   */
  protected[spark] def set(other: Edge[ED]): EdgeTriplet[VD, ED] = {
    srcId = other.srcId
    dstId = other.dstId
    attr = other.attr
    this
  }

  /**
   * Given one vertex in the edge return the other vertex.
   * (Given one vertex of the edge, returns the attribute of the other vertex.)
   * @param vid the id one of the two vertices on the edge
   * @return the attribute for the other vertex on the edge
   */
  def otherVertexAttr(vid: VertexId): VD =
    if (srcId == vid) dstAttr else { assert(dstId == vid); srcAttr }

  /**
   * Get the vertex object for the given vertex in the edge.
   * (Returns the attr of the vertex with the given VertexId.)
   *
   * @param vid the id of one of the two vertices on the edge
   * @return the attr for the vertex with that id
   */
  def vertexAttr(vid: VertexId): VD =
    if (srcId == vid) srcAttr else { assert(dstId == vid); dstAttr }

  override def toString: String = ((srcId, srcAttr), (dstId, dstAttr), attr).toString()

  // Convert to a tuple
  def toTuple: ((VertexId, VD), (VertexId, VD), ED) = ((srcId, srcAttr), (dstId, dstAttr), attr)
}

--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/graphx/GraphKryoRegistrator.scala:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import com.esotericsoftware.kryo.Kryo 21 | 22 | import org.apache.spark.serializer.KryoRegistrator 23 | import org.apache.spark.util.BoundedPriorityQueue 24 | import org.apache.spark.util.collection.BitSet 25 | 26 | import org.apache.spark.graphx.impl._ 27 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 28 | import org.apache.spark.util.collection.OpenHashSet 29 | 30 | /** 31 | * Registers GraphX classes with Kryo for improved performance. 32 | */ 33 | @deprecated("Register GraphX classes with Kryo using GraphXUtils.registerKryoClasses", "1.2.0") 34 | class GraphKryoRegistrator extends KryoRegistrator { 35 | 36 | def registerClasses(kryo: Kryo) { 37 | kryo.register(classOf[Edge[Object]]) 38 | kryo.register(classOf[(VertexId, Object)]) 39 | kryo.register(classOf[EdgePartition[Object, Object]]) 40 | kryo.register(classOf[BitSet]) 41 | kryo.register(classOf[VertexIdToIndexMap]) 42 | kryo.register(classOf[VertexAttributeBlock[Object]]) 43 | kryo.register(classOf[PartitionStrategy]) 44 | kryo.register(classOf[BoundedPriorityQueue[Object]]) 45 | kryo.register(classOf[EdgeDirection]) 46 | kryo.register(classOf[GraphXPrimitiveKeyOpenHashMap[VertexId, Int]]) 47 | kryo.register(classOf[OpenHashSet[Int]]) 48 | kryo.register(classOf[OpenHashSet[Long]]) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/GraphLoader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.storage.StorageLevel 21 | import org.apache.spark.{Logging, SparkContext} 22 | import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl} 23 | 24 | /** 25 | * Provides utilities for loading [[Graph]]s from files. 
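 *
 * A minimal usage sketch (assumes a `SparkContext` named `sc` and a hypothetical edge-list file;
 * not part of the original comment):
 * {{{
 * val graph: Graph[Int, Int] =
 *   GraphLoader.edgeListFile(sc, "/data/followers.txt", canonicalOrientation = true)
 * }}}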
26 | */ 27 | object GraphLoader extends Logging { 28 | 29 | /** 30 | * Loads a graph from an edge list formatted file where each line contains two integers: a source 31 | * id and a target id. Skips lines that begin with `#`. 32 | * 33 | * If desired the edges can be automatically oriented in the positive 34 | * direction (source Id < target Id) by setting `canonicalOrientation` to 35 | * true. 36 | * 37 | * @example Loads a file in the following format: 38 | * {{{ 39 | * # Comment Line 40 | * # Source Id <\t> Target Id 41 | * 1 -5 42 | * 1 2 43 | * 2 7 44 | * 1 8 45 | * }}} 46 | * 47 | * @param sc SparkContext 48 | * @param path the path to the file (e.g., /home/data/file or hdfs://file) 49 | * @param canonicalOrientation whether to orient edges in the positive 50 | * direction 51 | * @param numEdgePartitions the number of partitions for the edge RDD 52 | * Setting this value to -1 will use the default parallelism. 53 | * @param edgeStorageLevel the desired storage level for the edge partitions 54 | * @param vertexStorageLevel the desired storage level for the vertex partitions 55 | */ 56 | def edgeListFile( 57 | sc: SparkContext, 58 | path: String, 59 | canonicalOrientation: Boolean = false, 60 | numEdgePartitions: Int = -1, 61 | edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY, 62 | vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) 63 | : Graph[Int, Int] = 64 | { 65 | val startTime = System.currentTimeMillis 66 | 67 | // Parse the edge data table directly into edge partitions 68 | val lines = 69 | if (numEdgePartitions > 0) { 70 | sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions) 71 | } else { 72 | sc.textFile(path) 73 | } 74 | val edges = lines.mapPartitionsWithIndex { (pid, iter) => 75 | val builder = new EdgePartitionBuilder[Int, Int] 76 | iter.foreach { line => 77 | if (!line.isEmpty && line(0) != '#') { 78 | val lineArray = line.split("\\s+") 79 | if (lineArray.length < 2) { 80 | throw new IllegalArgumentException("Invalid line: " + line) 81 | } 82 | val srcId = lineArray(0).toLong 83 | val dstId = lineArray(1).toLong 84 | if (canonicalOrientation && srcId > dstId) { 85 | builder.add(dstId, srcId, 1) 86 | } else { 87 | builder.add(srcId, dstId, 1) 88 | } 89 | } 90 | } 91 | Iterator((pid, builder.toEdgePartition)) 92 | }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path)) 93 | edges.count() 94 | 95 | logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime)) 96 | 97 | GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel, 98 | vertexStorageLevel = vertexStorageLevel) 99 | } // end of edgeListFile 100 | 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/GraphOps.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.reflect.ClassTag 21 | import scala.util.Random 22 | 23 | import org.apache.spark.SparkException 24 | import org.apache.spark.SparkContext._ 25 | import org.apache.spark.rdd.RDD 26 | 27 | import org.apache.spark.graphx.lib._ 28 | 29 | /** 30 | * Contains additional functionality for [[Graph]]. All operations are expressed in terms of the 31 | * efficient GraphX API. This class is implicitly constructed for each Graph object. 32 | * 33 | * @tparam VD the vertex attribute type 34 | * @tparam ED the edge attribute type 35 | */ 36 | class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Serializable { 37 | 38 | /** The number of edges in the graph. */ 39 | @transient lazy val numEdges: Long = graph.edges.count() 40 | 41 | /** The number of vertices in the graph. */ 42 | @transient lazy val numVertices: Long = graph.vertices.count() 43 | 44 | /** 45 | * The in-degree of each vertex in the graph. 46 | * @note Vertices with no in-edges are not returned in the resulting RDD. 47 | */ 48 | @transient lazy val inDegrees: VertexRDD[Int] = 49 | degreesRDD(EdgeDirection.In).setName("GraphOps.inDegrees") 50 | 51 | /** 52 | * The out-degree of each vertex in the graph. 53 | * @note Vertices with no out-edges are not returned in the resulting RDD. 54 | */ 55 | @transient lazy val outDegrees: VertexRDD[Int] = 56 | degreesRDD(EdgeDirection.Out).setName("GraphOps.outDegrees") 57 | 58 | /** 59 | * The degree of each vertex in the graph. 60 | * @note Vertices with no edges are not returned in the resulting RDD. 61 | */ 62 | @transient lazy val degrees: VertexRDD[Int] = 63 | degreesRDD(EdgeDirection.Either).setName("GraphOps.degrees") 64 | 65 | /** 66 | * Computes the neighboring vertex degrees. 67 | * 68 | * @param edgeDirection the direction along which to collect neighboring vertex attributes 69 | */ 70 | private def degreesRDD(edgeDirection: EdgeDirection): VertexRDD[Int] = { 71 | if (edgeDirection == EdgeDirection.In) { 72 | graph.aggregateMessages(_.sendToDst(1), _ + _, TripletFields.None) 73 | } else if (edgeDirection == EdgeDirection.Out) { 74 | graph.aggregateMessages(_.sendToSrc(1), _ + _, TripletFields.None) 75 | } else { // EdgeDirection.Either 76 | graph.aggregateMessages(ctx => { ctx.sendToSrc(1); ctx.sendToDst(1) }, _ + _, 77 | TripletFields.None) 78 | } 79 | } 80 | 81 | /** 82 | * Collect the neighbor vertex ids for each vertex. 
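   *
   * Usage sketch (illustrative only; `graph` is any existing graph):
   * {{{
   * // ids of all neighbours, regardless of edge direction
   * val nbrIds: VertexRDD[Array[VertexId]] = graph.collectNeighborIds(EdgeDirection.Either)
   * }}}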
83 | * 84 | * @param edgeDirection the direction along which to collect 85 | * neighboring vertices 86 | * 87 | * @return the set of neighboring ids for each vertex 88 | */ 89 | def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexId]] = { 90 | val nbrs = 91 | if (edgeDirection == EdgeDirection.Either) { 92 | graph.aggregateMessages[Array[VertexId]]( 93 | ctx => { ctx.sendToSrc(Array(ctx.dstId)); ctx.sendToDst(Array(ctx.srcId)) }, 94 | _ ++ _, TripletFields.None) 95 | } else if (edgeDirection == EdgeDirection.Out) { 96 | graph.aggregateMessages[Array[VertexId]]( 97 | ctx => ctx.sendToSrc(Array(ctx.dstId)), 98 | _ ++ _, TripletFields.None) 99 | } else if (edgeDirection == EdgeDirection.In) { 100 | graph.aggregateMessages[Array[VertexId]]( 101 | ctx => ctx.sendToDst(Array(ctx.srcId)), 102 | _ ++ _, TripletFields.None) 103 | } else { 104 | throw new SparkException("It doesn't make sense to collect neighbor ids without a " + 105 | "direction. (EdgeDirection.Both is not supported; use EdgeDirection.Either instead.)") 106 | } 107 | graph.vertices.leftZipJoin(nbrs) { (vid, vdata, nbrsOpt) => 108 | nbrsOpt.getOrElse(Array.empty[VertexId]) 109 | } 110 | } // end of collectNeighborIds 111 | 112 | /** 113 | * Collect the neighbor vertex attributes for each vertex. 114 | * 115 | * @note This function could be highly inefficient on power-law 116 | * graphs where high degree vertices may force a large amount of 117 | * information to be collected to a single location. 118 | * 119 | * @param edgeDirection the direction along which to collect 120 | * neighboring vertices 121 | * 122 | * @return the vertex set of neighboring vertex attributes for each vertex 123 | */ 124 | def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexId, VD)]] = { 125 | val nbrs = edgeDirection match { 126 | case EdgeDirection.Either => 127 | graph.aggregateMessages[Array[(VertexId, VD)]]( 128 | ctx => { 129 | ctx.sendToSrc(Array((ctx.dstId, ctx.dstAttr))) 130 | ctx.sendToDst(Array((ctx.srcId, ctx.srcAttr))) 131 | }, 132 | (a, b) => a ++ b, TripletFields.All) 133 | case EdgeDirection.In => 134 | graph.aggregateMessages[Array[(VertexId, VD)]]( 135 | ctx => ctx.sendToDst(Array((ctx.srcId, ctx.srcAttr))), 136 | (a, b) => a ++ b, TripletFields.Src) 137 | case EdgeDirection.Out => 138 | graph.aggregateMessages[Array[(VertexId, VD)]]( 139 | ctx => ctx.sendToSrc(Array((ctx.dstId, ctx.dstAttr))), 140 | (a, b) => a ++ b, TripletFields.Dst) 141 | case EdgeDirection.Both => 142 | throw new SparkException("collectEdges does not support EdgeDirection.Both. Use" + 143 | "EdgeDirection.Either instead.") 144 | } 145 | graph.vertices.leftJoin(nbrs) { (vid, vdata, nbrsOpt) => 146 | nbrsOpt.getOrElse(Array.empty[(VertexId, VD)]) 147 | } 148 | } // end of collectNeighbor 149 | 150 | /** 151 | * Returns an RDD that contains for each vertex v its local edges, 152 | * i.e., the edges that are incident on v, in the user-specified direction. 153 | * Warning: note that singleton vertices, those with no edges in the given 154 | * direction will not be part of the return value. 155 | * 156 | * @note This function could be highly inefficient on power-law 157 | * graphs where high degree vertices may force a large amount of 158 | * information to be collected to a single location. 
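   *
   * Usage sketch (illustrative only):
   * {{{
   * // for every vertex, the edges pointing into it
   * val inEdges: VertexRDD[Array[Edge[ED]]] = graph.collectEdges(EdgeDirection.In)
   * }}}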
159 | * 160 | * @param edgeDirection the direction along which to collect 161 | * the local edges of vertices 162 | * 163 | * @return the local edges for each vertex 164 | */ 165 | def collectEdges(edgeDirection: EdgeDirection): VertexRDD[Array[Edge[ED]]] = { 166 | edgeDirection match { 167 | case EdgeDirection.Either => 168 | graph.aggregateMessages[Array[Edge[ED]]]( 169 | ctx => { 170 | ctx.sendToSrc(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))) 171 | ctx.sendToDst(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))) 172 | }, 173 | (a, b) => a ++ b, TripletFields.EdgeOnly) 174 | case EdgeDirection.In => 175 | graph.aggregateMessages[Array[Edge[ED]]]( 176 | ctx => ctx.sendToDst(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))), 177 | (a, b) => a ++ b, TripletFields.EdgeOnly) 178 | case EdgeDirection.Out => 179 | graph.aggregateMessages[Array[Edge[ED]]]( 180 | ctx => ctx.sendToSrc(Array(new Edge(ctx.srcId, ctx.dstId, ctx.attr))), 181 | (a, b) => a ++ b, TripletFields.EdgeOnly) 182 | case EdgeDirection.Both => 183 | throw new SparkException("collectEdges does not support EdgeDirection.Both. Use" + 184 | "EdgeDirection.Either instead.") 185 | } 186 | } 187 | 188 | /** 189 | * Join the vertices with an RDD and then apply a function from the 190 | * vertex and RDD entry to a new vertex value. The input table 191 | * should contain at most one entry for each vertex. If no entry is 192 | * provided the map function is skipped and the old value is used. 193 | * 194 | * @tparam U the type of entry in the table of updates 195 | * @param table the table to join with the vertices in the graph. 196 | * The table should contain at most one entry for each vertex. 197 | * @param mapFunc the function used to compute the new vertex 198 | * values. The map function is invoked only for vertices with a 199 | * corresponding entry in the table otherwise the old vertex value 200 | * is used. 201 | * 202 | * @example This function is used to update the vertices with new 203 | * values based on external data. For example we could add the out 204 | * degree to each vertex record 205 | * 206 | * {{{ 207 | * val rawGraph: Graph[Int, Int] = GraphLoader.edgeListFile(sc, "webgraph") 208 | * .mapVertices((_, _) => 0) 209 | * val outDeg = rawGraph.outDegrees 210 | * val graph = rawGraph.joinVertices[Int](outDeg) 211 | * ((_, _, outDeg) => outDeg) 212 | * }}} 213 | * 214 | */ 215 | def joinVertices[U: ClassTag](table: RDD[(VertexId, U)])(mapFunc: (VertexId, VD, U) => VD) 216 | : Graph[VD, ED] = { 217 | val uf = (id: VertexId, data: VD, o: Option[U]) => { 218 | o match { 219 | case Some(u) => mapFunc(id, data, u) 220 | case None => data 221 | } 222 | } 223 | graph.outerJoinVertices(table)(uf) 224 | } 225 | 226 | /** 227 | * Filter the graph by computing some values to filter on, and applying the predicates. 228 | * 229 | * @param preprocess a function to compute new vertex and edge data before filtering 230 | * @param epred edge pred to filter on after preprocess, see more details under 231 | * [[org.apache.spark.graphx.Graph#subgraph]] 232 | * @param vpred vertex pred to filter on after prerocess, see more details under 233 | * [[org.apache.spark.graphx.Graph#subgraph]] 234 | * @tparam VD2 vertex type the vpred operates on 235 | * @tparam ED2 edge type the epred operates on 236 | * @return a subgraph of the orginal graph, with its data unchanged 237 | * 238 | * @example This function can be used to filter the graph based on some property, without 239 | * changing the vertex and edge values in your program. 
For example, we could remove the vertices 240 | * in a graph with 0 outdegree 241 | * 242 | * {{{ 243 | * graph.filter( 244 | * graph => { 245 | * val degrees: VertexRDD[Int] = graph.outDegrees 246 | * graph.outerJoinVertices(degrees) {(vid, data, deg) => deg.getOrElse(0)} 247 | * }, 248 | * vpred = (vid: VertexId, deg:Int) => deg > 0 249 | * ) 250 | * }}} 251 | * 252 | */ 253 | def filter[VD2: ClassTag, ED2: ClassTag]( 254 | preprocess: Graph[VD, ED] => Graph[VD2, ED2], 255 | epred: (EdgeTriplet[VD2, ED2]) => Boolean = (x: EdgeTriplet[VD2, ED2]) => true, 256 | vpred: (VertexId, VD2) => Boolean = (v: VertexId, d: VD2) => true): Graph[VD, ED] = { 257 | graph.mask(preprocess(graph).subgraph(epred, vpred)) 258 | } 259 | 260 | /** 261 | * Picks a random vertex from the graph and returns its ID. 262 | */ 263 | def pickRandomVertex(): VertexId = { 264 | val probability = 50.0 / graph.numVertices 265 | var found = false 266 | var retVal: VertexId = null.asInstanceOf[VertexId] 267 | while (!found) { 268 | val selectedVertices = graph.vertices.flatMap { vidVvals => 269 | if (Random.nextDouble() < probability) { Some(vidVvals._1) } 270 | else { None } 271 | } 272 | if (selectedVertices.count > 1) { 273 | found = true 274 | val collectedVertices = selectedVertices.collect() 275 | retVal = collectedVertices(Random.nextInt(collectedVertices.size)) 276 | } 277 | } 278 | retVal 279 | } 280 | 281 | /** 282 | * Convert bi-directional edges into uni-directional ones. 283 | * Some graph algorithms (e.g., TriangleCount) assume that an input graph 284 | * has its edges in canonical direction. 285 | * This function rewrites the vertex ids of edges so that srcIds are smaller 286 | * than dstIds, and merges the duplicated edges. 287 | * 288 | * @param mergeFunc the user defined reduce function which should 289 | * be commutative and associative and is used to combine the output 290 | * of the map phase 291 | * 292 | * @return the resulting graph with canonical edges 293 | */ 294 | def convertToCanonicalEdges( 295 | mergeFunc: (ED, ED) => ED = (e1, e2) => e1): Graph[VD, ED] = { 296 | val newEdges = 297 | graph.edges 298 | .map { 299 | case e if e.srcId < e.dstId => ((e.srcId, e.dstId), e.attr) 300 | case e => ((e.dstId, e.srcId), e.attr) 301 | } 302 | .reduceByKey(mergeFunc) 303 | .map(e => new Edge(e._1._1, e._1._2, e._2)) 304 | Graph(graph.vertices, newEdges) 305 | } 306 | 307 | /** 308 | * Execute a Pregel-like iterative vertex-parallel abstraction. The 309 | * user-defined vertex-program `vprog` is executed in parallel on 310 | * each vertex receiving any inbound messages and computing a new 311 | * value for the vertex. The `sendMsg` function is then invoked on 312 | * all out-edges and is used to compute an optional message to the 313 | * destination vertex. The `mergeMsg` function is a commutative 314 | * associative function used to combine messages destined to the 315 | * same vertex. 316 | * 317 | * On the first iteration all vertices receive the `initialMsg` and 318 | * on subsequent iterations if a vertex does not receive a message 319 | * then the vertex-program is not invoked. 320 | * 321 | * This function iterates until there are no remaining messages, or 322 | * for `maxIterations` iterations. 
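   *
   * A single-source shortest-paths sketch (illustrative only; `sourceId` is a hypothetical vertex
   * id, edge attributes are assumed to be Double distances, and `mapVertices` sets the vertex
   * attributes to Double):
   * {{{
   * val init = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity)
   * val sssp = init.pregel(Double.PositiveInfinity)(
   *   (id, dist, newDist) => math.min(dist, newDist),            // vprog
   *   triplet =>                                                  // sendMsg
   *     if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
   *       Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
   *     } else {
   *       Iterator.empty
   *     },
   *   (a, b) => math.min(a, b))                                   // mergeMsg
   * }}}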
323 | * 324 | * @tparam A the Pregel message type 325 | * 326 | * @param initialMsg the message each vertex will receive at the on 327 | * the first iteration 328 | * 329 | * @param maxIterations the maximum number of iterations to run for 330 | * 331 | * @param activeDirection the direction of edges incident to a vertex that received a message in 332 | * the previous round on which to run `sendMsg`. For example, if this is `EdgeDirection.Out`, only 333 | * out-edges of vertices that received a message in the previous round will run. 334 | * 335 | * @param vprog the user-defined vertex program which runs on each 336 | * vertex and receives the inbound message and computes a new vertex 337 | * value. On the first iteration the vertex program is invoked on 338 | * all vertices and is passed the default message. On subsequent 339 | * iterations the vertex program is only invoked on those vertices 340 | * that receive messages. 341 | * 342 | * @param sendMsg a user supplied function that is applied to out 343 | * edges of vertices that received messages in the current 344 | * iteration 345 | * 346 | * @param mergeMsg a user supplied function that takes two incoming 347 | * messages of type A and merges them into a single message of type 348 | * A. ''This function must be commutative and associative and 349 | * ideally the size of A should not increase.'' 350 | * 351 | * @return the resulting graph at the end of the computation 352 | * 353 | */ 354 | def pregel[A: ClassTag]( 355 | initialMsg: A, 356 | maxIterations: Int = Int.MaxValue, 357 | activeDirection: EdgeDirection = EdgeDirection.Either)( 358 | vprog: (VertexId, VD, A) => VD, 359 | sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], 360 | mergeMsg: (A, A) => A) 361 | : Graph[VD, ED] = { 362 | Pregel(graph, initialMsg, maxIterations, activeDirection)(vprog, sendMsg, mergeMsg) 363 | } 364 | 365 | /** 366 | * Run a dynamic version of PageRank returning a graph with vertex attributes containing the 367 | * PageRank and edge attributes containing the normalized edge weight. 368 | * 369 | * @see [[org.apache.spark.graphx.lib.PageRank$#runUntilConvergence]] 370 | */ 371 | def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double] = { 372 | PageRank.runUntilConvergence(graph, tol, resetProb) 373 | } 374 | 375 | 376 | /** 377 | * Run personalized PageRank for a given vertex, such that all random walks 378 | * are started relative to the source node. 379 | * 380 | * @see [[org.apache.spark.graphx.lib.PageRank$#runUntilConvergenceWithOptions]] 381 | */ 382 | def personalizedPageRank(src: VertexId, tol: Double, 383 | resetProb: Double = 0.15) : Graph[Double, Double] = { 384 | PageRank.runUntilConvergenceWithOptions(graph, tol, resetProb, Some(src)) 385 | } 386 | 387 | /** 388 | * Run Personalized PageRank for a fixed number of iterations with 389 | * with all iterations originating at the source node 390 | * returning a graph with vertex attributes 391 | * containing the PageRank and edge attributes the normalized edge weight. 392 | * 393 | * @see [[org.apache.spark.graphx.lib.PageRank$#runWithOptions]] 394 | */ 395 | def staticPersonalizedPageRank(src: VertexId, numIter: Int, 396 | resetProb: Double = 0.15) : Graph[Double, Double] = { 397 | PageRank.runWithOptions(graph, numIter, resetProb, Some(src)) 398 | } 399 | 400 | /** 401 | * Run PageRank for a fixed number of iterations returning a graph with vertex attributes 402 | * containing the PageRank and edge attributes the normalized edge weight. 
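   *
   * Usage sketch (illustrative only):
   * {{{
   * val ranks: VertexRDD[Double] = graph.staticPageRank(numIter = 20).vertices
   * }}}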
403 | * 404 | * @see [[org.apache.spark.graphx.lib.PageRank$#run]] 405 | */ 406 | def staticPageRank(numIter: Int, resetProb: Double = 0.15): Graph[Double, Double] = { 407 | PageRank.run(graph, numIter, resetProb) 408 | } 409 | 410 | /** 411 | * Compute the connected component membership of each vertex and return a graph with the vertex 412 | * value containing the lowest vertex id in the connected component containing that vertex. 413 | * 414 | * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] 415 | */ 416 | def connectedComponents(): Graph[VertexId, ED] = { 417 | ConnectedComponents.run(graph) 418 | } 419 | 420 | /** 421 | * Compute the number of triangles passing through each vertex. 422 | * 423 | * @see [[org.apache.spark.graphx.lib.TriangleCount$#run]] 424 | */ 425 | def triangleCount(): Graph[Int, ED] = { 426 | TriangleCount.run(graph) 427 | } 428 | 429 | /** 430 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 431 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 432 | * 433 | * @see [[org.apache.spark.graphx.lib.StronglyConnectedComponents$#run]] 434 | */ 435 | def stronglyConnectedComponents(numIter: Int): Graph[VertexId, ED] = { 436 | StronglyConnectedComponents.run(graph, numIter) 437 | } 438 | } // end of GraphOps 439 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/GraphXUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.SparkConf 21 | 22 | import org.apache.spark.graphx.impl._ 23 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 24 | 25 | import org.apache.spark.util.collection.{OpenHashSet, BitSet} 26 | import org.apache.spark.util.BoundedPriorityQueue 27 | 28 | object GraphXUtils { 29 | /** 30 | * Registers classes that GraphX uses with Kryo. 
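   *
   * Typical setup sketch (illustrative only):
   * {{{
   * val conf = new SparkConf()
   *   .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
   * GraphXUtils.registerKryoClasses(conf)
   * val sc = new SparkContext(conf)
   * }}}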
31 | */ 32 | def registerKryoClasses(conf: SparkConf) { 33 | conf.registerKryoClasses(Array( 34 | classOf[Edge[Object]], 35 | classOf[(VertexId, Object)], 36 | classOf[EdgePartition[Object, Object]], 37 | classOf[BitSet], 38 | classOf[VertexIdToIndexMap], 39 | classOf[VertexAttributeBlock[Object]], 40 | classOf[PartitionStrategy], 41 | classOf[BoundedPriorityQueue[Object]], 42 | classOf[EdgeDirection], 43 | classOf[GraphXPrimitiveKeyOpenHashMap[VertexId, Int]], 44 | classOf[OpenHashSet[Int]], 45 | classOf[OpenHashSet[Long]])) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * Represents the way edges are assigned to edge partitions based on their source and destination 22 | * vertex IDs. 23 | */ 24 | trait PartitionStrategy extends Serializable { 25 | /** Returns the partition number for a given edge. */ 26 | def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID 27 | } 28 | 29 | /** 30 | * Collection of built-in [[PartitionStrategy]] implementations. 31 | */ 32 | object PartitionStrategy { 33 | /** 34 | * Assigns edges to partitions using a 2D partitioning of the sparse edge adjacency matrix, 35 | * guaranteeing a `2 * sqrt(numParts)` bound on vertex replication. 36 | * 37 | * Suppose we have a graph with 12 vertices that we want to partition 38 | * over 9 machines. We can use the following sparse matrix representation: 39 | * 40 | *
41 | * __________________________________ 42 | * v0 | P0 * | P1 | P2 * | 43 | * v1 | **** | * | | 44 | * v2 | ******* | ** | **** | 45 | * v3 | ***** | * * | * | 46 | * ---------------------------------- 47 | * v4 | P3 * | P4 *** | P5 ** * | 48 | * v5 | * * | * | | 49 | * v6 | * | ** | **** | 50 | * v7 | * * * | * * | * | 51 | * ---------------------------------- 52 | * v8 | P6 * | P7 * | P8 * *| 53 | * v9 | * | * * | | 54 | * v10 | * | ** | * * | 55 | * v11 | * <-E | *** | ** | 56 | * ---------------------------------- 57 | *58 | * 59 | * The edge denoted by `E` connects `v11` with `v1` and is assigned to processor `P6`. To get the 60 | * processor number we divide the matrix into `sqrt(numParts)` by `sqrt(numParts)` blocks. Notice 61 | * that edges adjacent to `v11` can only be in the first column of blocks `(P0, P3, 62 | * P6)` or the last 63 | * row of blocks `(P6, P7, P8)`. As a consequence we can guarantee that `v11` will need to be 64 | * replicated to at most `2 * sqrt(numParts)` machines. 65 | * 66 | * Notice that `P0` has many edges and as a consequence this partitioning would lead to poor work 67 | * balance. To improve balance we first multiply each vertex id by a large prime to shuffle the 68 | * vertex locations. 69 | * 70 | * When the number of partitions requested is not a perfect square we use a slightly different 71 | * method where the last column can have a different number of rows than the others while still 72 | * maintaining the same size per block. 73 | */ 74 | case object EdgePartition2D extends PartitionStrategy { 75 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 76 | val ceilSqrtNumParts: PartitionID = math.ceil(math.sqrt(numParts)).toInt 77 | val mixingPrime: VertexId = 1125899906842597L 78 | if (numParts == ceilSqrtNumParts * ceilSqrtNumParts) { 79 | // Use old method for perfect squared to ensure we get same results 80 | val col: PartitionID = (math.abs(src * mixingPrime) % ceilSqrtNumParts).toInt 81 | val row: PartitionID = (math.abs(dst * mixingPrime) % ceilSqrtNumParts).toInt 82 | (col * ceilSqrtNumParts + row) % numParts 83 | 84 | } else { 85 | // Otherwise use new method 86 | val cols = ceilSqrtNumParts 87 | val rows = (numParts + cols - 1) / cols 88 | val lastColRows = numParts - rows * (cols - 1) 89 | val col = (math.abs(src * mixingPrime) % numParts / rows).toInt 90 | val row = (math.abs(dst * mixingPrime) % (if (col < cols - 1) rows else lastColRows)).toInt 91 | col * rows + row 92 | 93 | } 94 | } 95 | } 96 | 97 | /** 98 | * Assigns edges to partitions using only the source vertex ID, colocating edges with the same 99 | * source. 100 | */ 101 | case object EdgePartition1D extends PartitionStrategy { 102 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 103 | val mixingPrime: VertexId = 1125899906842597L 104 | (math.abs(src * mixingPrime) % numParts).toInt 105 | } 106 | } 107 | 108 | 109 | /** 110 | * Assigns edges to partitions by hashing the source and destination vertex IDs, resulting in a 111 | * random vertex cut that colocates all same-direction edges between two vertices. 
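   *
   * Usage sketch (illustrative only; any of the built-in strategies can be passed the same way):
   * {{{
   * val partitioned = graph.partitionBy(PartitionStrategy.RandomVertexCut)
   * }}}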
112 | */ 113 | case object RandomVertexCut extends PartitionStrategy { 114 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 115 | math.abs((src, dst).hashCode()) % numParts 116 | } 117 | } 118 | 119 | 120 | /** 121 | * Assigns edges to partitions by hashing the source and destination vertex IDs in a canonical 122 | * direction, resulting in a random vertex cut that colocates all edges between two vertices, 123 | * regardless of direction. 124 | */ 125 | case object CanonicalRandomVertexCut extends PartitionStrategy { 126 | override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { 127 | if (src < dst) { 128 | math.abs((src, dst).hashCode()) % numParts 129 | } else { 130 | math.abs((dst, src).hashCode()) % numParts 131 | } 132 | } 133 | } 134 | 135 | /** Returns the PartitionStrategy with the specified name. */ 136 | def fromString(s: String): PartitionStrategy = s match { 137 | case "RandomVertexCut" => RandomVertexCut 138 | case "EdgePartition1D" => EdgePartition1D 139 | case "EdgePartition2D" => EdgePartition2D 140 | case "CanonicalRandomVertexCut" => CanonicalRandomVertexCut 141 | case _ => throw new IllegalArgumentException("Invalid PartitionStrategy: " + s) 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/Pregel.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.reflect.ClassTag 21 | import org.apache.spark.Logging 22 | 23 | 24 | /** 25 | * Implements a Pregel-like bulk-synchronous message-passing API. 26 | * 27 | * Unlike the original Pregel API, the GraphX Pregel API factors the sendMessage computation over 28 | * edges, enables the message sending computation to read both vertex attributes, and constrains 29 | * messages to the graph structure. These changes allow for substantially more efficient 30 | * distributed execution while also exposing greater flexibility for graph-based computation. 
31 | * 32 | * @example We can use the Pregel abstraction to implement PageRank: 33 | * {{{ 34 | * val pagerankGraph: Graph[Double, Double] = graph 35 | * // Associate the degree with each vertex 36 | * .outerJoinVertices(graph.outDegrees) { 37 | * (vid, vdata, deg) => deg.getOrElse(0) 38 | * } 39 | * // Set the weight on the edges based on the degree 40 | * .mapTriplets(e => 1.0 / e.srcAttr) 41 | * // Set the vertex attributes to the initial pagerank values 42 | * .mapVertices((id, attr) => 1.0) 43 | * 44 | * def vertexProgram(id: VertexId, attr: Double, msgSum: Double): Double = 45 | * resetProb + (1.0 - resetProb) * msgSum 46 | * def sendMessage(id: VertexId, edge: EdgeTriplet[Double, Double]): Iterator[(VertexId, Double)] = 47 | * Iterator((edge.dstId, edge.srcAttr * edge.attr)) 48 | * def messageCombiner(a: Double, b: Double): Double = a + b 49 | * val initialMessage = 0.0 50 | * // Execute Pregel for a fixed number of iterations. 51 | * Pregel(pagerankGraph, initialMessage, numIter)( 52 | * vertexProgram, sendMessage, messageCombiner) 53 | * }}} 54 | * 55 | */ 56 | object Pregel extends Logging { 57 | 58 | /** 59 | * Execute a Pregel-like iterative vertex-parallel abstraction. The 60 | * user-defined vertex-program `vprog` is executed in parallel on 61 | * each vertex receiving any inbound messages and computing a new 62 | * value for the vertex. The `sendMsg` function is then invoked on 63 | * all out-edges and is used to compute an optional message to the 64 | * destination vertex. The `mergeMsg` function is a commutative 65 | * associative function used to combine messages destined to the 66 | * same vertex. 67 | * 68 | * On the first iteration all vertices receive the `initialMsg` and 69 | * on subsequent iterations if a vertex does not receive a message 70 | * then the vertex-program is not invoked. 71 | * 72 | * This function iterates until there are no remaining messages, or 73 | * for `maxIterations` iterations. 74 | * 75 | * @tparam VD the vertex data type 76 | * @tparam ED the edge data type 77 | * @tparam A the Pregel message type 78 | * 79 | * @param graph the input graph. 80 | * 81 | * @param initialMsg the message each vertex will receive at the first 82 | * iteration 83 | * 84 | * @param maxIterations the maximum number of iterations to run for 85 | * 86 | * @param activeDirection the direction of edges incident to a vertex that received a message in 87 | * the previous round on which to run `sendMsg`. For example, if this is `EdgeDirection.Out`, only 88 | * out-edges of vertices that received a message in the previous round will run. The default is 89 | * `EdgeDirection.Either`, which will run `sendMsg` on edges where either side received a message 90 | * in the previous round. If this is `EdgeDirection.Both`, `sendMsg` will only run on edges where 91 | * *both* vertices received a message. 92 | * 93 | * @param vprog the user-defined vertex program which runs on each 94 | * vertex and receives the inbound message and computes a new vertex 95 | * value. On the first iteration the vertex program is invoked on 96 | * all vertices and is passed the default message. On subsequent 97 | * iterations the vertex program is only invoked on those vertices 98 | * that receive messages. 
99 | * 100 | * @param sendMsg a user supplied function that is applied to out 101 | * edges of vertices that received messages in the current 102 | * iteration 103 | * 104 | * @param mergeMsg a user supplied function that takes two incoming 105 | * messages of type A and merges them into a single message of type 106 | * A. ''This function must be commutative and associative and 107 | * ideally the size of A should not increase.'' 108 | * 109 | * @return the resulting graph at the end of the computation 110 | * 111 | */ 112 | def apply[VD: ClassTag, ED: ClassTag, A: ClassTag] 113 | (graph: Graph[VD, ED], 114 | initialMsg: A, 115 | maxIterations: Int = Int.MaxValue, 116 | activeDirection: EdgeDirection = EdgeDirection.Either) 117 | (vprog: (VertexId, VD, A) => VD, 118 | sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], 119 | mergeMsg: (A, A) => A) 120 | : Graph[VD, ED] = 121 | { 122 | var g = graph.mapVertices((vid, vdata) => vprog(vid, vdata, initialMsg)).cache() 123 | // compute the messages 124 | var messages = g.mapReduceTriplets(sendMsg, mergeMsg) 125 | var activeMessages = messages.count() 126 | // Loop 127 | var prevG: Graph[VD, ED] = null 128 | var i = 0 129 | while (activeMessages > 0 && i < maxIterations) { 130 | // Receive the messages and update the vertices. 131 | prevG = g 132 | g = g.joinVertices(messages)(vprog).cache() 133 | 134 | val oldMessages = messages 135 | // Send new messages, skipping edges where neither side received a message. We must cache 136 | // messages so it can be materialized on the next line, allowing us to uncache the previous 137 | // iteration. 138 | messages = g.mapReduceTriplets( 139 | sendMsg, mergeMsg, Some((oldMessages, activeDirection))).cache() 140 | // The call to count() materializes `messages` and the vertices of `g`. This hides oldMessages 141 | // (depended on by the vertices of g) and the vertices of prevG (depended on by oldMessages 142 | // and the vertices of g). 143 | activeMessages = messages.count() 144 | 145 | logInfo("Pregel finished iteration " + i) 146 | 147 | // Unpersist the RDDs hidden by newly-materialized RDDs 148 | oldMessages.unpersist(blocking = false) 149 | prevG.unpersistVertices(blocking = false) 150 | prevG.edges.unpersist(blocking = false) 151 | // count the iteration 152 | i += 1 153 | } 154 | 155 | g 156 | } // end of apply 157 | 158 | } // end of class Pregel 159 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/VertexRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | import org.apache.spark.rdd._ 25 | import org.apache.spark.storage.StorageLevel 26 | 27 | import org.apache.spark.graphx.impl.RoutingTablePartition 28 | import org.apache.spark.graphx.impl.ShippableVertexPartition 29 | import org.apache.spark.graphx.impl.VertexAttributeBlock 30 | import org.apache.spark.graphx.impl.VertexRDDImpl 31 | 32 | /** 33 | * Extends `RDD[(VertexId, VD)]` by ensuring that there is only one entry for each vertex and by 34 | * pre-indexing the entries for fast, efficient joins. Two VertexRDDs with the same index can be 35 | * joined efficiently. All operations except [[reindex]] preserve the index. To construct a 36 | * `VertexRDD`, use the [[org.apache.spark.graphx.VertexRDD$ VertexRDD object]]. 37 | * 38 | * Additionally, stores routing information to enable joining the vertex attributes with an 39 | * [[EdgeRDD]]. 40 | * 41 | * @example Construct a `VertexRDD` from a plain RDD: 42 | * {{{ 43 | * // Construct an initial vertex set 44 | * val someData: RDD[(VertexId, SomeType)] = loadData(someFile) 45 | * val vset = VertexRDD(someData) 46 | * // If there were redundant values in someData we would use a reduceFunc 47 | * val vset2 = VertexRDD(someData, reduceFunc) 48 | * // Finally we can use the VertexRDD to index another dataset 49 | * val otherData: RDD[(VertexId, OtherType)] = loadData(otherFile) 50 | * val vset3 = vset2.innerJoin(otherData) { (vid, a, b) => b } 51 | * // Now we can construct very fast joins between the two sets 52 | * val vset4: VertexRDD[(SomeType, OtherType)] = vset.leftJoin(vset3) 53 | * }}} 54 | * 55 | * @tparam VD the vertex attribute associated with each vertex in the set. 56 | */ 57 | abstract class VertexRDD[VD]( 58 | sc: SparkContext, 59 | deps: Seq[Dependency[_]]) extends RDD[(VertexId, VD)](sc, deps) { 60 | 61 | implicit protected def vdTag: ClassTag[VD] 62 | 63 | private[graphx] def partitionsRDD: RDD[ShippableVertexPartition[VD]] 64 | 65 | override protected def getPartitions: Array[Partition] = partitionsRDD.partitions 66 | 67 | /** 68 | * Provides the `RDD[(VertexId, VD)]` equivalent output. 69 | */ 70 | override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = { 71 | firstParent[ShippableVertexPartition[VD]].iterator(part, context).next().iterator 72 | } 73 | 74 | /** 75 | * Construct a new VertexRDD that is indexed by only the visible vertices. The resulting 76 | * VertexRDD will be based on a different index and can no longer be quickly joined with this 77 | * RDD. 78 | */ 79 | def reindex(): VertexRDD[VD] 80 | 81 | /** 82 | * Applies a function to each `VertexPartition` of this RDD and returns a new VertexRDD. 83 | * 对当前RDD的每个分区进行函数变换得到一个新的VertexRDD 84 | */ 85 | private[graphx] def mapVertexPartitions[VD2: ClassTag]( 86 | f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]) 87 | : VertexRDD[VD2] 88 | 89 | /** 90 | * Restricts the vertex set to the set of vertices satisfying the given predicate. This operation 91 | * preserves the index for efficient joins with the original RDD, and it sets bits in the bitmask 92 | * rather than allocating new memory. 93 | * 94 | * It is declared and defined here to allow refining the return type from `RDD[(VertexId, VD)]` to 95 | * `VertexRDD[VD]`. 
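A quick hypothetical usage sketch of the refined `filter` described above; `vset` is an assumed `VertexRDD[Double]`, not something defined in this file.

```scala
// Because filter is overridden here, the result is still a VertexRDD (same index,
// bitmask-based restriction) rather than a plain RDD[(VertexId, VD)].
val positives: VertexRDD[Double] = vset.filter { case (vid, attr) => attr > 0.0 }
```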
96 | * 97 | * @param pred the user defined predicate, which takes a tuple to conform to the 98 | * `RDD[(VertexId, VD)]` interface 99 | */ 100 | override def filter(pred: Tuple2[VertexId, VD] => Boolean): VertexRDD[VD] = 101 | this.mapVertexPartitions(_.filter(Function.untupled(pred))) 102 | 103 | /** 104 | * Maps each vertex attribute, preserving the index. 105 | * 106 | * @tparam VD2 the type returned by the map function 107 | * 108 | * @param f the function applied to each value in the RDD 109 | * @return a new VertexRDD with values obtained by applying `f` to each of the entries in the 110 | * original VertexRDD 111 | */ 112 | def mapValues[VD2: ClassTag](f: VD => VD2): VertexRDD[VD2] 113 | 114 | /** 115 | * Maps each vertex attribute, additionally supplying the vertex ID. 116 | * 117 | * @tparam VD2 the type returned by the map function 118 | * 119 | * @param f the function applied to each ID-value pair in the RDD 120 | * @return a new VertexRDD with values obtained by applying `f` to each of the entries in the 121 | * original VertexRDD. The resulting VertexRDD retains the same index. 122 | */ 123 | def mapValues[VD2: ClassTag](f: (VertexId, VD) => VD2): VertexRDD[VD2] 124 | 125 | /** 126 | * For each VertexId present in both `this` and `other`, minus will act as a set difference 127 | * operation returning only those unique VertexId's present in `this`. 128 | *根据VertexId做差集, VertexRDD[VD]和RDD[(VertexId, VD)]底层存储是一致的, 129 | * VertexRDD[VD]是RDD[(VertexId, VD)]的子类 130 | * @param other an RDD to run the set operation against 131 | */ 132 | def minus(other: RDD[(VertexId, VD)]): VertexRDD[VD] 133 | 134 | /** 135 | * For each VertexId present in both `this` and `other`, minus will act as a set difference 136 | * operation returning only those unique VertexId's present in `this`. 137 | *根据VertexId做差集 138 | * @param other a VertexRDD to run the set operation against 139 | */ 140 | def minus(other: VertexRDD[VD]): VertexRDD[VD] 141 | 142 | /** 143 | * For each vertex present in both `this` and `other`, `diff` returns only those vertices with 144 | * differing values; for values that are different, keeps the values from `other`. This is 145 | * only guaranteed to work if the VertexRDDs share a common ancestor. 146 | *去掉this和other中有相同值的点。如果冲突留下other的 147 | * @param other the other RDD[(VertexId, VD)] with which to diff against. 148 | */ 149 | def diff(other: RDD[(VertexId, VD)]): VertexRDD[VD] 150 | 151 | /** 152 | * For each vertex present in both `this` and `other`, `diff` returns only those vertices with 153 | * differing values; for values that are different, keeps the values from `other`. This is 154 | * only guaranteed to work if the VertexRDDs share a common ancestor. 155 | *去掉this和other中有相同值的点 156 | * @param other the other VertexRDD with which to diff against. 157 | */ 158 | def diff(other: VertexRDD[VD]): VertexRDD[VD] 159 | 160 | /** 161 | * Left joins this RDD with anotther VertexRDD with the same index. This function will fail if 162 | * both VertexRDDs do not share the same index. The resuling vertex set contains an entry for 163 | * each vertex in `this`. 164 | * 对相同index的元素做连接,左边有而右边没有的,右边返回None 165 | * If `other` is missing any vertex in this VertexRDD, `f` is passed `None`. 166 | * 167 | * @tparam VD2 the attribute type of the other VertexRDD 168 | * other VertexRDD中属性的类型 169 | * @tparam VD3 the attribute type of the resulting VertexRDD 170 | * @param other the other VertexRDD with which to join. 
171 | * @param f the function mapping a vertex id and its attributes in this and the other vertex set 172 | * to a new vertex attribute. 173 | * @return a VertexRDD containing the results of `f` 174 | */ 175 | def leftZipJoin[VD2: ClassTag, VD3: ClassTag] 176 | (other: VertexRDD[VD2])(f: (VertexId, VD, Option[VD2]) => VD3): VertexRDD[VD3] 177 | 178 | /** 179 | * Left joins this VertexRDD with an RDD containing vertex attribute pairs. If the other RDD is 180 | * backed by a VertexRDD with the same index then the efficient [[leftZipJoin]] implementation is 181 | * used. The resulting VertexRDD contains an entry for each vertex in `this`. If `other` is 182 | * missing any vertex in this VertexRDD, `f` is passed `None`. If there are duplicates, 183 | * the vertex is picked arbitrarily.如果有重复会从中随意挑选一个。 184 | * 185 | * @tparam VD2 the attribute type of the other VertexRDD 186 | * @tparam VD3 the attribute type of the resulting VertexRDD 187 | * 188 | * @param other the other VertexRDD with which to join 189 | * @param f the function mapping a vertex id and its attributes in this and the other vertex set 190 | * to a new vertex attribute. 191 | * @return a VertexRDD containing all the vertices in this VertexRDD with the attributes emitted (发出,放出) 192 | * by `f`. 193 | */ 194 | def leftJoin[VD2: ClassTag, VD3: ClassTag] 195 | (other: RDD[(VertexId, VD2)]) 196 | (f: (VertexId, VD, Option[VD2]) => VD3) 197 | : VertexRDD[VD3] 198 | 199 | /** 200 | * Efficiently inner joins this VertexRDD with another VertexRDD sharing the same index. See 201 | * [[innerJoin]] for the behavior of the join. 202 | */ 203 | def innerZipJoin[U: ClassTag, VD2: ClassTag](other: VertexRDD[U]) 204 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] 205 | 206 | /** 207 | * Inner joins this VertexRDD with an RDD containing vertex attribute pairs. If the other RDD is 208 | * backed by a VertexRDD with the same index then the efficient [[innerZipJoin]] implementation 209 | * is used. 210 | * 211 | * @param other an RDD containing vertices to join. If there are multiple entries for the same 212 | * vertex, one is picked arbitrarily. Use [[aggregateUsingIndex]] to merge multiple entries. 213 | * @param f the join function applied to corresponding values of `this` and `other` 214 | * @return a VertexRDD co-indexed with `this`, containing only vertices that appear in both 215 | * `this` and `other`, with values supplied by `f` 216 | */ 217 | def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) 218 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] 219 | 220 | /** 221 | * Aggregates vertices in `messages` that have the same ids using `reduceFunc`, returning a 222 | * VertexRDD co-indexed with `this`. 223 | * 224 | * @param messages an RDD containing messages to aggregate, where each message is a pair of its 225 | * target vertex ID and the message data 226 | * @param reduceFunc the associative aggregation function for merging messages to the same vertex 227 | * @return a VertexRDD co-indexed with `this`, containing only vertices that received messages. 228 | * For those vertices, their values are the result of applying `reduceFunc` to all received 229 | * messages. 230 | */ 231 | def aggregateUsingIndex[VD2: ClassTag]( 232 | messages: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] 233 | 234 | /** 235 | * Returns a new `VertexRDD` reflecting a reversal of all edge directions in the corresponding 236 | * [[EdgeRDD]]. 
237 | * 把相应的EdgeRDD进行反向得到的新的VertexRDD 238 | */ 239 | def reverseRoutingTables(): VertexRDD[VD] 240 | 241 | /** Prepares this VertexRDD for efficient joins with the given EdgeRDD. */ 242 | def withEdges(edges: EdgeRDD[_]): VertexRDD[VD] 243 | 244 | /** Replaces the vertex partitions while preserving all other properties of the VertexRDD. */ 245 | private[graphx] def withPartitionsRDD[VD2: ClassTag]( 246 | partitionsRDD: RDD[ShippableVertexPartition[VD2]]): VertexRDD[VD2] 247 | 248 | /** 249 | * Changes the target storage level while preserving all other properties of the 250 | * VertexRDD. Operations on the returned VertexRDD will preserve this storage level. 251 | *改变存储级别。 252 | * This does not actually trigger a cache; to do this, call 253 | * [[org.apache.spark.graphx.VertexRDD#cache]] on the returned VertexRDD. 254 | */ 255 | private[graphx] def withTargetStorageLevel( 256 | targetStorageLevel: StorageLevel): VertexRDD[VD] 257 | 258 | /** Generates an RDD of vertex attributes suitable for shipping to the edge partitions. */ 259 | private[graphx] def shipVertexAttributes( 260 | shipSrc: Boolean, shipDst: Boolean): RDD[(PartitionID, VertexAttributeBlock[VD])] 261 | 262 | /** Generates an RDD of vertex IDs suitable for shipping to the edge partitions. */ 263 | private[graphx] def shipVertexIds(): RDD[(PartitionID, Array[VertexId])] 264 | 265 | } // end of VertexRDD 266 | 267 | 268 | /** 269 | * The VertexRDD singleton is used to construct VertexRDDs. 270 | */ 271 | object VertexRDD { 272 | 273 | /** 274 | * Constructs a standalone `VertexRDD` (one that is not set up for efficient joins with an 275 | * [[EdgeRDD]]) from an RDD of vertex-attribute pairs. Duplicate entries are removed arbitrarily. 276 | *赋值操作 277 | * @tparam VD the vertex attribute type 278 | * 279 | * @param vertices the collection of vertex-attribute pairs 280 | */ 281 | def apply[VD: ClassTag](vertices: RDD[(VertexId, VD)]): VertexRDD[VD] = { 282 | val vPartitioned: RDD[(VertexId, VD)] = vertices.partitioner match { 283 | case Some(p) => vertices 284 | case None => vertices.partitionBy(new HashPartitioner(vertices.partitions.size)) 285 | } 286 | val vertexPartitions = vPartitioned.mapPartitions( 287 | iter => Iterator(ShippableVertexPartition(iter)), 288 | preservesPartitioning = true) 289 | new VertexRDDImpl(vertexPartitions) 290 | } 291 | 292 | /** 293 | * Constructs a `VertexRDD` from an RDD of vertex-attribute pairs. Duplicate vertex entries are 294 | * removed arbitrarily. The resulting `VertexRDD` will be joinable with `edges`, and any missing 295 | * vertices referred to by `edges` will be created with the attribute `defaultVal`. 296 | * 297 | * @tparam VD the vertex attribute type 298 | * 299 | * @param vertices the collection of vertex-attribute pairs 300 | * @param edges the [[EdgeRDD]] that these vertices may be joined with 301 | * @param defaultVal the vertex attribute to use when creating missing vertices 302 | */ 303 | def apply[VD: ClassTag]( 304 | vertices: RDD[(VertexId, VD)], edges: EdgeRDD[_], defaultVal: VD): VertexRDD[VD] = { 305 | VertexRDD(vertices, edges, defaultVal, (a, b) => a) 306 | } 307 | 308 | /** 309 | * Constructs a `VertexRDD` from an RDD of vertex-attribute pairs. Duplicate vertex entries are 310 | * merged using `mergeFunc`. The resulting `VertexRDD` will be joinable with `edges`, and any 311 | * missing vertices referred to by `edges` will be created with the attribute `defaultVal`. 
312 | * 313 | * @tparam VD the vertex attribute type 314 | * 315 | * @param vertices the collection of vertex-attribute pairs 316 | * @param edges the [[EdgeRDD]] that these vertices may be joined with 317 | * @param defaultVal the vertex attribute to use when creating missing vertices 318 | * @param mergeFunc the commutative, associative duplicate vertex attribute merge function 319 | */ 320 | def apply[VD: ClassTag]( 321 | vertices: RDD[(VertexId, VD)], edges: EdgeRDD[_], defaultVal: VD, mergeFunc: (VD, VD) => VD 322 | ): VertexRDD[VD] = { 323 | val vPartitioned: RDD[(VertexId, VD)] = vertices.partitioner match { 324 | case Some(p) => vertices 325 | case None => vertices.partitionBy(new HashPartitioner(vertices.partitions.size)) 326 | } 327 | val routingTables = createRoutingTables(edges, vPartitioned.partitioner.get) 328 | val vertexPartitions = vPartitioned.zipPartitions(routingTables, preservesPartitioning = true) { 329 | (vertexIter, routingTableIter) => 330 | val routingTable = 331 | if (routingTableIter.hasNext) routingTableIter.next() else RoutingTablePartition.empty 332 | Iterator(ShippableVertexPartition(vertexIter, routingTable, defaultVal, mergeFunc)) 333 | } 334 | new VertexRDDImpl(vertexPartitions) 335 | } 336 | 337 | /** 338 | * Constructs a `VertexRDD` containing all vertices referred to in `edges`. The vertices will be 339 | * created with the attribute `defaultVal`. The resulting `VertexRDD` will be joinable with 340 | * `edges`. 341 | *从边生成VertexRDD 342 | * @tparam VD the vertex attribute type 343 | * 344 | * @param edges the [[EdgeRDD]] referring to the vertices to create 345 | * @param numPartitions the desired number of partitions for the resulting `VertexRDD` 346 | * @param defaultVal the vertex attribute to use when creating missing vertices 347 | */ 348 | def fromEdges[VD: ClassTag]( 349 | edges: EdgeRDD[_], numPartitions: Int, defaultVal: VD): VertexRDD[VD] = { 350 | val routingTables = createRoutingTables(edges, new HashPartitioner(numPartitions)) 351 | val vertexPartitions = routingTables.mapPartitions({ routingTableIter => 352 | val routingTable = 353 | if (routingTableIter.hasNext) routingTableIter.next() else RoutingTablePartition.empty 354 | Iterator(ShippableVertexPartition(Iterator.empty, routingTable, defaultVal)) 355 | }, preservesPartitioning = true) 356 | new VertexRDDImpl(vertexPartitions) 357 | } 358 | 359 | private[graphx] def createRoutingTables( 360 | edges: EdgeRDD[_], vertexPartitioner: Partitioner): RDD[RoutingTablePartition] = { 361 | // Determine which vertices each edge partition needs by creating a mapping from vid to pid. 362 | val vid2pid = edges.partitionsRDD.mapPartitions(_.flatMap( 363 | Function.tupled(RoutingTablePartition.edgePartitionToMsgs))) 364 | .setName("VertexRDD.createRoutingTables - vid2pid (aggregation)") 365 | 366 | val numEdgePartitions = edges.partitions.size 367 | vid2pid.partitionBy(vertexPartitioner).mapPartitions( 368 | iter => Iterator(RoutingTablePartition.fromMsgs(numEdgePartitions, iter)), 369 | preservesPartitioning = true) 370 | } 371 | } 372 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 24 | import org.apache.spark.util.collection.{SortDataFormat, Sorter, PrimitiveVector} 25 | 26 | /** Constructs an EdgePartition from scratch. */ 27 | private[graphx] 28 | class EdgePartitionBuilder[@specialized(Long, Int, Double) ED: ClassTag, VD: ClassTag]( 29 | size: Int = 64) { 30 | private[this] val edges = new PrimitiveVector[Edge[ED]](size) 31 | 32 | /** Add a new edge to the partition. */ 33 | def add(src: VertexId, dst: VertexId, d: ED) { 34 | edges += Edge(src, dst, d) 35 | } 36 | 37 | def toEdgePartition: EdgePartition[ED, VD] = { 38 | val edgeArray = edges.trim().array 39 | new Sorter(Edge.edgeArraySortDataFormat[ED]) 40 | .sort(edgeArray, 0, edgeArray.length, Edge.lexicographicOrdering) 41 | val localSrcIds = new Array[Int](edgeArray.size) 42 | val localDstIds = new Array[Int](edgeArray.size) 43 | val data = new Array[ED](edgeArray.size) 44 | val index = new GraphXPrimitiveKeyOpenHashMap[VertexId, Int] 45 | val global2local = new GraphXPrimitiveKeyOpenHashMap[VertexId, Int] 46 | val local2global = new PrimitiveVector[VertexId] 47 | var vertexAttrs = Array.empty[VD] 48 | // Copy edges into columnar structures, tracking the beginnings of source vertex id clusters and 49 | // adding them to the index. Also populate a map from vertex id to a sequential local offset. 50 | if (edgeArray.length > 0) { 51 | index.update(edgeArray(0).srcId, 0) 52 | var currSrcId: VertexId = edgeArray(0).srcId 53 | var currLocalId = -1 54 | var i = 0 55 | while (i < edgeArray.size) { 56 | val srcId = edgeArray(i).srcId 57 | val dstId = edgeArray(i).dstId 58 | localSrcIds(i) = global2local.changeValue(srcId, 59 | { currLocalId += 1; local2global += srcId; currLocalId }, identity) 60 | localDstIds(i) = global2local.changeValue(dstId, 61 | { currLocalId += 1; local2global += dstId; currLocalId }, identity) 62 | data(i) = edgeArray(i).attr 63 | if (srcId != currSrcId) { 64 | currSrcId = srcId 65 | index.update(currSrcId, i) 66 | } 67 | 68 | i += 1 69 | } 70 | vertexAttrs = new Array[VD](currLocalId + 1) 71 | } 72 | new EdgePartition( 73 | localSrcIds, localDstIds, data, index, global2local, local2global.trim().array, vertexAttrs, 74 | None) 75 | } 76 | } 77 | 78 | /** 79 | * Constructs an EdgePartition from an existing EdgePartition with the same vertex set. This enables 80 | * reuse of the local vertex ids. Intended for internal use in EdgePartition only. 
81 | */ 82 | private[impl] 83 | class ExistingEdgePartitionBuilder[ 84 | @specialized(Long, Int, Double) ED: ClassTag, VD: ClassTag]( 85 | global2local: GraphXPrimitiveKeyOpenHashMap[VertexId, Int], 86 | local2global: Array[VertexId], 87 | vertexAttrs: Array[VD], 88 | activeSet: Option[VertexSet], 89 | size: Int = 64) { 90 | private[this] val edges = new PrimitiveVector[EdgeWithLocalIds[ED]](size) 91 | 92 | /** Add a new edge to the partition. */ 93 | def add(src: VertexId, dst: VertexId, localSrc: Int, localDst: Int, d: ED) { 94 | edges += EdgeWithLocalIds(src, dst, localSrc, localDst, d) 95 | } 96 | 97 | def toEdgePartition: EdgePartition[ED, VD] = { 98 | val edgeArray = edges.trim().array 99 | new Sorter(EdgeWithLocalIds.edgeArraySortDataFormat[ED]) 100 | .sort(edgeArray, 0, edgeArray.length, EdgeWithLocalIds.lexicographicOrdering) 101 | val localSrcIds = new Array[Int](edgeArray.size) 102 | val localDstIds = new Array[Int](edgeArray.size) 103 | val data = new Array[ED](edgeArray.size) 104 | val index = new GraphXPrimitiveKeyOpenHashMap[VertexId, Int] 105 | // Copy edges into columnar structures, tracking the beginnings of source vertex id clusters and 106 | // adding them to the index 107 | if (edgeArray.length > 0) { 108 | index.update(edgeArray(0).srcId, 0) 109 | var currSrcId: VertexId = edgeArray(0).srcId 110 | var i = 0 111 | while (i < edgeArray.size) { 112 | localSrcIds(i) = edgeArray(i).localSrcId 113 | localDstIds(i) = edgeArray(i).localDstId 114 | data(i) = edgeArray(i).attr 115 | if (edgeArray(i).srcId != currSrcId) { 116 | currSrcId = edgeArray(i).srcId 117 | index.update(currSrcId, i) 118 | } 119 | i += 1 120 | } 121 | } 122 | 123 | new EdgePartition( 124 | localSrcIds, localDstIds, data, index, global2local, local2global, vertexAttrs, activeSet) 125 | } 126 | } 127 | 128 | private[impl] case class EdgeWithLocalIds[@specialized ED]( 129 | srcId: VertexId, dstId: VertexId, localSrcId: Int, localDstId: Int, attr: ED) 130 | 131 | private[impl] object EdgeWithLocalIds { 132 | implicit def lexicographicOrdering[ED]: Ordering[EdgeWithLocalIds[ED]] = 133 | new Ordering[EdgeWithLocalIds[ED]] { 134 | override def compare(a: EdgeWithLocalIds[ED], b: EdgeWithLocalIds[ED]): Int = { 135 | if (a.srcId == b.srcId) { 136 | if (a.dstId == b.dstId) 0 137 | else if (a.dstId < b.dstId) -1 138 | else 1 139 | } else if (a.srcId < b.srcId) -1 140 | else 1 141 | } 142 | } 143 | 144 | private[graphx] def edgeArraySortDataFormat[ED] = { 145 | new SortDataFormat[EdgeWithLocalIds[ED], Array[EdgeWithLocalIds[ED]]] { 146 | override def getKey(data: Array[EdgeWithLocalIds[ED]], pos: Int): EdgeWithLocalIds[ED] = { 147 | data(pos) 148 | } 149 | 150 | override def swap(data: Array[EdgeWithLocalIds[ED]], pos0: Int, pos1: Int): Unit = { 151 | val tmp = data(pos0) 152 | data(pos0) = data(pos1) 153 | data(pos1) = tmp 154 | } 155 | 156 | override def copyElement( 157 | src: Array[EdgeWithLocalIds[ED]], srcPos: Int, 158 | dst: Array[EdgeWithLocalIds[ED]], dstPos: Int) { 159 | dst(dstPos) = src(srcPos) 160 | } 161 | 162 | override def copyRange( 163 | src: Array[EdgeWithLocalIds[ED]], srcPos: Int, 164 | dst: Array[EdgeWithLocalIds[ED]], dstPos: Int, length: Int) { 165 | System.arraycopy(src, srcPos, dst, dstPos, length) 166 | } 167 | 168 | override def allocate(length: Int): Array[EdgeWithLocalIds[ED]] = { 169 | new Array[EdgeWithLocalIds[ED]](length) 170 | } 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- 
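Before moving on to EdgeRDDImpl, a hedged sketch of how the builder above is driven. `EdgePartitionBuilder` is `private[graphx]`, so outside that package treat this as pseudocode for the data flow rather than compilable user code.

```scala
import org.apache.spark.graphx.impl.EdgePartitionBuilder

// Collect edges in any order, then freeze them into a sorted, columnar
// EdgePartition with local vertex ids and a clustered source-vertex index.
val builder = new EdgePartitionBuilder[Double, Int]()   // ED = Double, VD = Int
builder.add(3L, 7L, 1.0)
builder.add(11L, 1L, 0.5)
builder.add(3L, 2L, 2.0)                                // insertion order is irrelevant
val part = builder.toEdgePartition                      // sorts by (srcId, dstId)
assert(part.size == 3)                                  // number of edges in the partition
```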
/src/main/scala/org/apache/spark/graphx/impl/EdgeRDDImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.{classTag, ClassTag} 21 | 22 | import org.apache.spark.{OneToOneDependency, HashPartitioner} 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.storage.StorageLevel 25 | 26 | import org.apache.spark.graphx._ 27 | 28 | class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( 29 | @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], 30 | val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) 31 | extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { 32 | 33 | override def setName(_name: String): this.type = { 34 | if (partitionsRDD.name != null) { 35 | partitionsRDD.setName(partitionsRDD.name + ", " + _name) 36 | } else { 37 | partitionsRDD.setName(_name) 38 | } 39 | this 40 | } 41 | setName("EdgeRDD") 42 | 43 | /** 44 | * If `partitionsRDD` already has a partitioner, use it. Otherwise assume that the 45 | * [[PartitionID]]s in `partitionsRDD` correspond to the actual partitions and create a new 46 | * partitioner that allows co-partitioning with `partitionsRDD`. 47 | */ 48 | override val partitioner = 49 | partitionsRDD.partitioner.orElse(Some(new HashPartitioner(partitions.size))) 50 | 51 | override def collect(): Array[Edge[ED]] = this.map(_.copy()).collect() 52 | 53 | /** 54 | * Persists the edge partitions at the specified storage level, ignoring any existing target 55 | * storage level. 56 | */ 57 | override def persist(newLevel: StorageLevel): this.type = { 58 | partitionsRDD.persist(newLevel) 59 | this 60 | } 61 | 62 | override def unpersist(blocking: Boolean = true): this.type = { 63 | partitionsRDD.unpersist(blocking) 64 | this 65 | } 66 | 67 | /** Persists the edge partitions using `targetStorageLevel`, which defaults to MEMORY_ONLY. */ 68 | override def cache(): this.type = { 69 | partitionsRDD.persist(targetStorageLevel) 70 | this 71 | } 72 | 73 | override def getStorageLevel: StorageLevel = partitionsRDD.getStorageLevel 74 | 75 | override def checkpoint(): Unit = { 76 | partitionsRDD.checkpoint() 77 | } 78 | 79 | override def isCheckpointed: Boolean = { 80 | firstParent[(PartitionID, EdgePartition[ED, VD])].isCheckpointed 81 | } 82 | 83 | override def getCheckpointFile: Option[String] = { 84 | partitionsRDD.getCheckpointFile 85 | } 86 | 87 | /** The number of edges in the RDD. 
*/ 88 | override def count(): Long = { 89 | partitionsRDD.map(_._2.size.toLong).reduce(_ + _) 90 | } 91 | 92 | override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = 93 | mapEdgePartitions((pid, part) => part.map(f)) 94 | 95 | override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) 96 | 97 | def filter( 98 | epred: EdgeTriplet[VD, ED] => Boolean, 99 | vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { 100 | mapEdgePartitions((pid, part) => part.filter(epred, vpred)) 101 | } 102 | 103 | override def innerJoin[ED2: ClassTag, ED3: ClassTag] 104 | (other: EdgeRDD[ED2]) 105 | (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { 106 | val ed2Tag = classTag[ED2] 107 | val ed3Tag = classTag[ED3] 108 | this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { 109 | (thisIter, otherIter) => 110 | val (pid, thisEPart) = thisIter.next() 111 | val (_, otherEPart) = otherIter.next() 112 | Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) 113 | }) 114 | } 115 | 116 | def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( 117 | f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { 118 | this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => 119 | if (iter.hasNext) { 120 | val (pid, ep) = iter.next() 121 | Iterator(Tuple2(pid, f(pid, ep))) 122 | } else { 123 | Iterator.empty 124 | } 125 | }, preservesPartitioning = true)) 126 | } 127 | 128 | private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( 129 | partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { 130 | new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) 131 | } 132 | 133 | override private[graphx] def withTargetStorageLevel( 134 | targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { 135 | new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.{classTag, ClassTag} 21 | 22 | import org.apache.spark.HashPartitioner 23 | import org.apache.spark.SparkContext._ 24 | import org.apache.spark.rdd.{RDD, ShuffledRDD} 25 | import org.apache.spark.storage.StorageLevel 26 | import org.apache.spark.graphx._ 27 | import org.apache.spark.graphx.impl.GraphImpl._ 28 | import org.apache.spark.graphx.util.BytecodeUtils 29 | 30 | 31 | /** 32 | * An implementation of [[org.apache.spark.graphx.Graph]] to support computation on graphs. 33 | * 34 | * Graphs are represented using two RDDs: `vertices`, which contains vertex attributes and the 35 | * routing information for shipping vertex attributes to edge partitions, and 36 | * `replicatedVertexView`, which contains edges and the vertex attributes mentioned by each edge. 37 | */ 38 | class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( 39 | @transient val vertices: VertexRDD[VD], 40 | @transient val replicatedVertexView: ReplicatedVertexView[VD, ED]) 41 | extends Graph[VD, ED] with Serializable { 42 | 43 | /** Default constructor is provided to support serialization */ 44 | protected def this() = this(null, null) 45 | 46 | @transient override val edges: EdgeRDDImpl[ED, VD] = replicatedVertexView.edges 47 | 48 | /** Return a RDD that brings edges together with their source and destination vertices. */ 49 | @transient override lazy val triplets: RDD[EdgeTriplet[VD, ED]] = { 50 | replicatedVertexView.upgrade(vertices, true, true) 51 | replicatedVertexView.edges.partitionsRDD.mapPartitions(_.flatMap { 52 | case (pid, part) => part.tripletIterator() 53 | }) 54 | } 55 | 56 | override def persist(newLevel: StorageLevel): Graph[VD, ED] = { 57 | vertices.persist(newLevel) 58 | replicatedVertexView.edges.persist(newLevel) 59 | this 60 | } 61 | 62 | override def cache(): Graph[VD, ED] = { 63 | vertices.cache() 64 | replicatedVertexView.edges.cache() 65 | this 66 | } 67 | 68 | override def checkpoint(): Unit = { 69 | vertices.checkpoint() 70 | replicatedVertexView.edges.checkpoint() 71 | } 72 | 73 | override def isCheckpointed: Boolean = { 74 | vertices.isCheckpointed && replicatedVertexView.edges.isCheckpointed 75 | } 76 | 77 | override def getCheckpointFiles: Seq[String] = { 78 | Seq(vertices.getCheckpointFile, replicatedVertexView.edges.getCheckpointFile).flatMap { 79 | case Some(path) => Seq(path) 80 | case None => Seq() 81 | } 82 | } 83 | 84 | override def unpersist(blocking: Boolean = true): Graph[VD, ED] = { 85 | unpersistVertices(blocking) 86 | replicatedVertexView.edges.unpersist(blocking) 87 | this 88 | } 89 | 90 | override def unpersistVertices(blocking: Boolean = true): Graph[VD, ED] = { 91 | vertices.unpersist(blocking) 92 | // TODO: unpersist the replicated vertices in `replicatedVertexView` but leave the edges alone 93 | this 94 | } 95 | 96 | override def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] = { 97 | partitionBy(partitionStrategy, edges.partitions.size) 98 | } 99 | 100 | override def partitionBy( 101 | partitionStrategy: PartitionStrategy, numPartitions: Int): Graph[VD, ED] = { 102 | val edTag = classTag[ED] 103 | val vdTag = classTag[VD] 104 | val newEdges = edges.withPartitionsRDD(edges.map { e => 105 | val part: PartitionID = partitionStrategy.getPartition(e.srcId, e.dstId, numPartitions) 106 | (part, (e.srcId, e.dstId, e.attr)) 107 | } 108 | .partitionBy(new HashPartitioner(numPartitions)) 109 | .mapPartitionsWithIndex( { (pid, iter) => 110 | val builder = new 
EdgePartitionBuilder[ED, VD]()(edTag, vdTag) 111 | iter.foreach { message => 112 | val data = message._2 113 | builder.add(data._1, data._2, data._3) 114 | } 115 | val edgePartition = builder.toEdgePartition 116 | Iterator((pid, edgePartition)) 117 | }, preservesPartitioning = true)).cache() 118 | GraphImpl.fromExistingRDDs(vertices.withEdges(newEdges), newEdges) 119 | } 120 | 121 | override def reverse: Graph[VD, ED] = { 122 | new GraphImpl(vertices.reverseRoutingTables(), replicatedVertexView.reverse()) 123 | } 124 | 125 | override def mapVertices[VD2: ClassTag] 126 | (f: (VertexId, VD) => VD2)(implicit eq: VD =:= VD2 = null): Graph[VD2, ED] = { 127 | // The implicit parameter eq will be populated by the compiler if VD and VD2 are equal, and left 128 | // null if not 129 | if (eq != null) { 130 | vertices.cache() 131 | // The map preserves type, so we can use incremental replication 132 | val newVerts = vertices.mapVertexPartitions(_.map(f)).cache() 133 | val changedVerts = vertices.asInstanceOf[VertexRDD[VD2]].diff(newVerts) 134 | val newReplicatedVertexView = replicatedVertexView.asInstanceOf[ReplicatedVertexView[VD2, ED]] 135 | .updateVertices(changedVerts) 136 | new GraphImpl(newVerts, newReplicatedVertexView) 137 | } else { 138 | // The map does not preserve type, so we must re-replicate all vertices 139 | GraphImpl(vertices.mapVertexPartitions(_.map(f)), replicatedVertexView.edges) 140 | } 141 | } 142 | 143 | override def mapEdges[ED2: ClassTag]( 144 | f: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2] = { 145 | val newEdges = replicatedVertexView.edges 146 | .mapEdgePartitions((pid, part) => part.map(f(pid, part.iterator))) 147 | new GraphImpl(vertices, replicatedVertexView.withEdges(newEdges)) 148 | } 149 | 150 | override def mapTriplets[ED2: ClassTag]( 151 | f: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2], 152 | tripletFields: TripletFields): Graph[VD, ED2] = { 153 | vertices.cache() 154 | replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst) 155 | val newEdges = replicatedVertexView.edges.mapEdgePartitions { (pid, part) => 156 | part.map(f(pid, part.tripletIterator(tripletFields.useSrc, tripletFields.useDst))) 157 | } 158 | new GraphImpl(vertices, replicatedVertexView.withEdges(newEdges)) 159 | } 160 | 161 | override def subgraph( 162 | epred: EdgeTriplet[VD, ED] => Boolean = x => true, 163 | vpred: (VertexId, VD) => Boolean = (a, b) => true): Graph[VD, ED] = { 164 | vertices.cache() 165 | // Filter the vertices, reusing the partitioner and the index from this graph 166 | val newVerts = vertices.mapVertexPartitions(_.filter(vpred)) 167 | // Filter the triplets. 
We must always upgrade the triplet view fully because vpred always runs 168 | // on both src and dst vertices 169 | replicatedVertexView.upgrade(vertices, true, true) 170 | val newEdges = replicatedVertexView.edges.filter(epred, vpred) 171 | new GraphImpl(newVerts, replicatedVertexView.withEdges(newEdges)) 172 | } 173 | 174 | override def mask[VD2: ClassTag, ED2: ClassTag] ( 175 | other: Graph[VD2, ED2]): Graph[VD, ED] = { 176 | val newVerts = vertices.innerJoin(other.vertices) { (vid, v, w) => v } 177 | val newEdges = replicatedVertexView.edges.innerJoin(other.edges) { (src, dst, v, w) => v } 178 | new GraphImpl(newVerts, replicatedVertexView.withEdges(newEdges)) 179 | } 180 | 181 | override def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED] = { 182 | val newEdges = replicatedVertexView.edges.mapEdgePartitions( 183 | (pid, part) => part.groupEdges(merge)) 184 | new GraphImpl(vertices, replicatedVertexView.withEdges(newEdges)) 185 | } 186 | 187 | // /////////////////////////////////////////////////////////////////////////////////////////////// 188 | // Lower level transformation methods 189 | // /////////////////////////////////////////////////////////////////////////////////////////////// 190 | 191 | override def mapReduceTriplets[A: ClassTag]( 192 | mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], 193 | reduceFunc: (A, A) => A, 194 | activeSetOpt: Option[(VertexRDD[_], EdgeDirection)]): VertexRDD[A] = { 195 | 196 | def sendMsg(ctx: EdgeContext[VD, ED, A]) { 197 | mapFunc(ctx.toEdgeTriplet).foreach { kv => 198 | val id = kv._1 199 | val msg = kv._2 200 | if (id == ctx.srcId) { 201 | ctx.sendToSrc(msg) 202 | } else { 203 | assert(id == ctx.dstId) 204 | ctx.sendToDst(msg) 205 | } 206 | } 207 | } 208 | 209 | val mapUsesSrcAttr = accessesVertexAttr(mapFunc, "srcAttr") 210 | val mapUsesDstAttr = accessesVertexAttr(mapFunc, "dstAttr") 211 | val tripletFields = new TripletFields(mapUsesSrcAttr, mapUsesDstAttr, true) 212 | 213 | aggregateMessagesWithActiveSet(sendMsg, reduceFunc, tripletFields, activeSetOpt) 214 | } 215 | 216 | override def aggregateMessagesWithActiveSet[A: ClassTag]( 217 | sendMsg: EdgeContext[VD, ED, A] => Unit, 218 | mergeMsg: (A, A) => A, 219 | tripletFields: TripletFields, 220 | activeSetOpt: Option[(VertexRDD[_], EdgeDirection)]): VertexRDD[A] = { 221 | 222 | vertices.cache() 223 | // For each vertex, replicate its attribute only to partitions where it is 224 | // in the relevant position in an edge. 225 | replicatedVertexView.upgrade(vertices, tripletFields.useSrc, tripletFields.useDst) 226 | val view = activeSetOpt match { 227 | case Some((activeSet, _)) => 228 | replicatedVertexView.withActiveSet(activeSet) 229 | case None => 230 | replicatedVertexView 231 | } 232 | val activeDirectionOpt = activeSetOpt.map(_._2) 233 | 234 | // Map and combine. 
235 | val preAgg = view.edges.partitionsRDD.mapPartitions(_.flatMap { 236 | case (pid, edgePartition) => 237 | // Choose scan method 238 | val activeFraction = edgePartition.numActives.getOrElse(0) / edgePartition.indexSize.toFloat 239 | activeDirectionOpt match { 240 | case Some(EdgeDirection.Both) => 241 | if (activeFraction < 0.8) { 242 | edgePartition.aggregateMessagesIndexScan(sendMsg, mergeMsg, tripletFields, 243 | EdgeActiveness.Both) 244 | } else { 245 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 246 | EdgeActiveness.Both) 247 | } 248 | case Some(EdgeDirection.Either) => 249 | // TODO: Because we only have a clustered index on the source vertex ID, we can't filter 250 | // the index here. Instead we have to scan all edges and then do the filter. 251 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 252 | EdgeActiveness.Either) 253 | case Some(EdgeDirection.Out) => 254 | if (activeFraction < 0.8) { 255 | edgePartition.aggregateMessagesIndexScan(sendMsg, mergeMsg, tripletFields, 256 | EdgeActiveness.SrcOnly) 257 | } else { 258 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 259 | EdgeActiveness.SrcOnly) 260 | } 261 | case Some(EdgeDirection.In) => 262 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 263 | EdgeActiveness.DstOnly) 264 | case _ => // None 265 | edgePartition.aggregateMessagesEdgeScan(sendMsg, mergeMsg, tripletFields, 266 | EdgeActiveness.Neither) 267 | } 268 | }).setName("GraphImpl.aggregateMessages - preAgg") 269 | 270 | // do the final reduction reusing the index map 271 | vertices.aggregateUsingIndex(preAgg, mergeMsg) 272 | } 273 | 274 | override def outerJoinVertices[U: ClassTag, VD2: ClassTag] 275 | (other: RDD[(VertexId, U)]) 276 | (updateF: (VertexId, VD, Option[U]) => VD2) 277 | (implicit eq: VD =:= VD2 = null): Graph[VD2, ED] = { 278 | // The implicit parameter eq will be populated by the compiler if VD and VD2 are equal, and left 279 | // null if not 280 | if (eq != null) { 281 | vertices.cache() 282 | // updateF preserves type, so we can use incremental replication 283 | val newVerts = vertices.leftJoin(other)(updateF).cache() 284 | val changedVerts = vertices.asInstanceOf[VertexRDD[VD2]].diff(newVerts) 285 | val newReplicatedVertexView = replicatedVertexView.asInstanceOf[ReplicatedVertexView[VD2, ED]] 286 | .updateVertices(changedVerts) 287 | new GraphImpl(newVerts, newReplicatedVertexView) 288 | } else { 289 | // updateF does not preserve type, so we must re-replicate all vertices 290 | val newVerts = vertices.leftJoin(other)(updateF) 291 | GraphImpl(newVerts, replicatedVertexView.edges) 292 | } 293 | } 294 | 295 | /** Test whether the closure accesses the the attribute with name `attrName`. */ 296 | private def accessesVertexAttr(closure: AnyRef, attrName: String): Boolean = { 297 | try { 298 | BytecodeUtils.invokedMethod(closure, classOf[EdgeTriplet[VD, ED]], attrName) 299 | } catch { 300 | case _: ClassNotFoundException => true // if we don't know, be conservative 301 | } 302 | } 303 | } // end of class GraphImpl 304 | 305 | 306 | object GraphImpl { 307 | 308 | /** Create a graph from edges, setting referenced vertices to `defaultVertexAttr`. 
*/ 309 | def apply[VD: ClassTag, ED: ClassTag]( 310 | edges: RDD[Edge[ED]], 311 | defaultVertexAttr: VD, 312 | edgeStorageLevel: StorageLevel, 313 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 314 | fromEdgeRDD(EdgeRDD.fromEdges(edges), defaultVertexAttr, edgeStorageLevel, vertexStorageLevel) 315 | } 316 | 317 | /** Create a graph from EdgePartitions, setting referenced vertices to `defaultVertexAttr`. */ 318 | def fromEdgePartitions[VD: ClassTag, ED: ClassTag]( 319 | edgePartitions: RDD[(PartitionID, EdgePartition[ED, VD])], 320 | defaultVertexAttr: VD, 321 | edgeStorageLevel: StorageLevel, 322 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 323 | fromEdgeRDD(EdgeRDD.fromEdgePartitions(edgePartitions), defaultVertexAttr, edgeStorageLevel, 324 | vertexStorageLevel) 325 | } 326 | 327 | /** Create a graph from vertices and edges, setting missing vertices to `defaultVertexAttr`. */ 328 | def apply[VD: ClassTag, ED: ClassTag]( 329 | vertices: RDD[(VertexId, VD)], 330 | edges: RDD[Edge[ED]], 331 | defaultVertexAttr: VD, 332 | edgeStorageLevel: StorageLevel, 333 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 334 | val edgeRDD = EdgeRDD.fromEdges(edges)(classTag[ED], classTag[VD]) 335 | .withTargetStorageLevel(edgeStorageLevel) 336 | val vertexRDD = VertexRDD(vertices, edgeRDD, defaultVertexAttr) 337 | .withTargetStorageLevel(vertexStorageLevel) 338 | GraphImpl(vertexRDD, edgeRDD) 339 | } 340 | 341 | /** 342 | * Create a graph from a VertexRDD and an EdgeRDD with arbitrary replicated vertices. The 343 | * VertexRDD must already be set up for efficient joins with the EdgeRDD by calling 344 | * `VertexRDD.withEdges` or an appropriate VertexRDD constructor. 345 | */ 346 | def apply[VD: ClassTag, ED: ClassTag]( 347 | vertices: VertexRDD[VD], 348 | edges: EdgeRDD[ED]): GraphImpl[VD, ED] = { 349 | 350 | vertices.cache() 351 | 352 | // Convert the vertex partitions in edges to the correct type 353 | val newEdges = edges.asInstanceOf[EdgeRDDImpl[ED, _]] 354 | .mapEdgePartitions((pid, part) => part.withoutVertexAttributes[VD]) 355 | .cache() 356 | 357 | GraphImpl.fromExistingRDDs(vertices, newEdges) 358 | } 359 | 360 | /** 361 | * Create a graph from a VertexRDD and an EdgeRDD with the same replicated vertex type as the 362 | * vertices. The VertexRDD must already be set up for efficient joins with the EdgeRDD by calling 363 | * `VertexRDD.withEdges` or an appropriate VertexRDD constructor. 364 | */ 365 | def fromExistingRDDs[VD: ClassTag, ED: ClassTag]( 366 | vertices: VertexRDD[VD], 367 | edges: EdgeRDD[ED]): GraphImpl[VD, ED] = { 368 | new GraphImpl(vertices, new ReplicatedVertexView(edges.asInstanceOf[EdgeRDDImpl[ED, VD]])) 369 | } 370 | 371 | /** 372 | * Create a graph from an EdgeRDD with the correct vertex type, setting missing vertices to 373 | * `defaultVertexAttr`. The vertices will have the same number of partitions as the EdgeRDD. 
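Stepping back from these constructors: the `aggregateMessagesWithActiveSet` / `mapReduceTriplets` machinery shown earlier in GraphImpl backs the public `Graph.aggregateMessages` API. A hedged usage sketch follows; `sc` is an assumed SparkContext and the public `aggregateMessages` method (Spark 1.2+) is assumed to be present in this snapshot.

```scala
import org.apache.spark.graphx._

// Count in-neighbours of every vertex. No vertex or edge attributes are read,
// so TripletFields.None lets GraphX skip shipping attributes to edge partitions.
// Vertices with no in-edges simply do not appear in the result.
val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(2L, 3L, 1.0), Edge(1L, 3L, 1.0)))
val graph: Graph[Double, Double] = Graph.fromEdges(edges, defaultValue = 0.0)
val inDegrees: VertexRDD[Int] = graph.aggregateMessages[Int](
  ctx => ctx.sendToDst(1),   // one message per edge, sent to the destination
  _ + _,                     // merge messages destined to the same vertex
  TripletFields.None)
```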
374 | */ 375 | private def fromEdgeRDD[VD: ClassTag, ED: ClassTag]( 376 | edges: EdgeRDDImpl[ED, VD], 377 | defaultVertexAttr: VD, 378 | edgeStorageLevel: StorageLevel, 379 | vertexStorageLevel: StorageLevel): GraphImpl[VD, ED] = { 380 | val edgesCached = edges.withTargetStorageLevel(edgeStorageLevel).cache() 381 | val vertices = VertexRDD.fromEdges(edgesCached, edgesCached.partitions.size, defaultVertexAttr) 382 | .withTargetStorageLevel(vertexStorageLevel) 383 | fromExistingRDDs(vertices, edgesCached) 384 | } 385 | 386 | } // end of object GraphImpl 387 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/ReplicatedVertexView.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.{classTag, ClassTag} 21 | 22 | import org.apache.spark.SparkContext._ 23 | import org.apache.spark.rdd.RDD 24 | 25 | import org.apache.spark.graphx._ 26 | 27 | /** 28 | * Manages shipping vertex attributes to the edge partitions of an 29 | * [[org.apache.spark.graphx.EdgeRDD]]. Vertex attributes may be partially shipped to construct a 30 | * triplet view with vertex attributes on only one side, and they may be updated. An active vertex 31 | * set may additionally be shipped to the edge partitions. Be careful not to store a reference to 32 | * `edges`, since it may be modified when the attribute shipping level is upgraded. 33 | */ 34 | private[impl] 35 | class ReplicatedVertexView[VD: ClassTag, ED: ClassTag]( 36 | var edges: EdgeRDDImpl[ED, VD], 37 | var hasSrcId: Boolean = false, 38 | var hasDstId: Boolean = false) { 39 | 40 | /** 41 | * Return a new `ReplicatedVertexView` with the specified `EdgeRDD`, which must have the same 42 | * shipping level. 43 | */ 44 | def withEdges[VD2: ClassTag, ED2: ClassTag]( 45 | edges_ : EdgeRDDImpl[ED2, VD2]): ReplicatedVertexView[VD2, ED2] = { 46 | new ReplicatedVertexView(edges_, hasSrcId, hasDstId) 47 | } 48 | 49 | /** 50 | * Return a new `ReplicatedVertexView` where edges are reversed and shipping levels are swapped to 51 | * match. 52 | */ 53 | def reverse(): ReplicatedVertexView[VD, ED] = { 54 | val newEdges = edges.mapEdgePartitions((pid, part) => part.reverse) 55 | new ReplicatedVertexView(newEdges, hasDstId, hasSrcId) 56 | } 57 | 58 | /** 59 | * Upgrade the shipping level in-place to the specified levels by shipping vertex attributes from 60 | * `vertices`. This operation modifies the `ReplicatedVertexView`, and callers can access `edges` 61 | * afterwards to obtain the upgraded view. 
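As a hedged illustration of what these shipping levels mean at the public API level (using an assumed `graph: Graph[Double, Double]`): reading full triplets forces both attribute sides to be shipped, while declaring a narrower `TripletFields` lets `upgrade` replicate less.

```scala
// GraphImpl.triplets (shown earlier) calls upgrade(vertices, true, true),
// so both source and destination attributes are shipped to edge partitions.
val trips = graph.triplets

// A map that only reads srcAttr can say so; upgrade then ships only source
// attributes (upgrade(vertices, true, false)).
val srcWeights = graph.mapTriplets(t => t.srcAttr, TripletFields.Src)
```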
62 | */ 63 | def upgrade(vertices: VertexRDD[VD], includeSrc: Boolean, includeDst: Boolean) { 64 | val shipSrc = includeSrc && !hasSrcId 65 | val shipDst = includeDst && !hasDstId 66 | if (shipSrc || shipDst) { 67 | val shippedVerts: RDD[(Int, VertexAttributeBlock[VD])] = 68 | vertices.shipVertexAttributes(shipSrc, shipDst) 69 | .setName("ReplicatedVertexView.upgrade(%s, %s) - shippedVerts %s %s (broadcast)".format( 70 | includeSrc, includeDst, shipSrc, shipDst)) 71 | .partitionBy(edges.partitioner.get) 72 | val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) { 73 | (ePartIter, shippedVertsIter) => ePartIter.map { 74 | case (pid, edgePartition) => 75 | (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator))) 76 | } 77 | }) 78 | edges = newEdges 79 | hasSrcId = includeSrc 80 | hasDstId = includeDst 81 | } 82 | } 83 | 84 | /** 85 | * Return a new `ReplicatedVertexView` where the `activeSet` in each edge partition contains only 86 | * vertex ids present in `actives`. This ships a vertex id to all edge partitions where it is 87 | * referenced, ignoring the attribute shipping level. 88 | */ 89 | def withActiveSet(actives: VertexRDD[_]): ReplicatedVertexView[VD, ED] = { 90 | val shippedActives = actives.shipVertexIds() 91 | .setName("ReplicatedVertexView.withActiveSet - shippedActives (broadcast)") 92 | .partitionBy(edges.partitioner.get) 93 | 94 | val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedActives) { 95 | (ePartIter, shippedActivesIter) => ePartIter.map { 96 | case (pid, edgePartition) => 97 | (pid, edgePartition.withActiveSet(shippedActivesIter.flatMap(_._2.iterator))) 98 | } 99 | }) 100 | new ReplicatedVertexView(newEdges, hasSrcId, hasDstId) 101 | } 102 | 103 | /** 104 | * Return a new `ReplicatedVertexView` where vertex attributes in edge partition are updated using 105 | * `updates`. This ships a vertex attribute only to the edge partitions where it is in the 106 | * position(s) specified by the attribute shipping level. 107 | */ 108 | def updateVertices(updates: VertexRDD[VD]): ReplicatedVertexView[VD, ED] = { 109 | val shippedVerts = updates.shipVertexAttributes(hasSrcId, hasDstId) 110 | .setName("ReplicatedVertexView.updateVertices - shippedVerts %s %s (broadcast)".format( 111 | hasSrcId, hasDstId)) 112 | .partitionBy(edges.partitioner.get) 113 | 114 | val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) { 115 | (ePartIter, shippedVertsIter) => ePartIter.map { 116 | case (pid, edgePartition) => 117 | (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator))) 118 | } 119 | }) 120 | new ReplicatedVertexView(newEdges, hasSrcId, hasDstId) 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/RoutingTablePartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.Partitioner 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.rdd.ShuffledRDD 25 | import org.apache.spark.util.collection.{BitSet, PrimitiveVector} 26 | 27 | import org.apache.spark.graphx._ 28 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 29 | 30 | import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage 31 | 32 | private[graphx] 33 | object RoutingTablePartition { 34 | /** 35 | * A message from an edge partition to a vertex specifying the position in which the edge 36 | * partition references the vertex (src, dst, or both). The edge partition is encoded in the lower 37 | * 30 bits of the Int, and the position is encoded in the upper 2 bits of the Int. 38 | */ 39 | type RoutingTableMessage = (VertexId, Int) 40 | 41 | private def toMessage(vid: VertexId, pid: PartitionID, position: Byte): RoutingTableMessage = { 42 | val positionUpper2 = position << 30 43 | val pidLower30 = pid & 0x3FFFFFFF 44 | (vid, positionUpper2 | pidLower30) 45 | } 46 | 47 | private def vidFromMessage(msg: RoutingTableMessage): VertexId = msg._1 48 | private def pidFromMessage(msg: RoutingTableMessage): PartitionID = msg._2 & 0x3FFFFFFF 49 | private def positionFromMessage(msg: RoutingTableMessage): Byte = (msg._2 >> 30).toByte 50 | 51 | val empty: RoutingTablePartition = new RoutingTablePartition(Array.empty) 52 | 53 | /** Generate a `RoutingTableMessage` for each vertex referenced in `edgePartition`. */ 54 | def edgePartitionToMsgs(pid: PartitionID, edgePartition: EdgePartition[_, _]) 55 | : Iterator[RoutingTableMessage] = { 56 | // Determine which positions each vertex id appears in using a map where the low 2 bits 57 | // represent src and dst 58 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, Byte] 59 | edgePartition.iterator.foreach { e => 60 | map.changeValue(e.srcId, 0x1, (b: Byte) => (b | 0x1).toByte) 61 | map.changeValue(e.dstId, 0x2, (b: Byte) => (b | 0x2).toByte) 62 | } 63 | map.iterator.map { vidAndPosition => 64 | val vid = vidAndPosition._1 65 | val position = vidAndPosition._2 66 | toMessage(vid, pid, position) 67 | } 68 | } 69 | 70 | /** Build a `RoutingTablePartition` from `RoutingTableMessage`s. 
*/ 71 | def fromMsgs(numEdgePartitions: Int, iter: Iterator[RoutingTableMessage]) 72 | : RoutingTablePartition = { 73 | val pid2vid = Array.fill(numEdgePartitions)(new PrimitiveVector[VertexId]) 74 | val srcFlags = Array.fill(numEdgePartitions)(new PrimitiveVector[Boolean]) 75 | val dstFlags = Array.fill(numEdgePartitions)(new PrimitiveVector[Boolean]) 76 | for (msg <- iter) { 77 | val vid = vidFromMessage(msg) 78 | val pid = pidFromMessage(msg) 79 | val position = positionFromMessage(msg) 80 | pid2vid(pid) += vid 81 | srcFlags(pid) += (position & 0x1) != 0 82 | dstFlags(pid) += (position & 0x2) != 0 83 | } 84 | 85 | new RoutingTablePartition(pid2vid.zipWithIndex.map { 86 | case (vids, pid) => (vids.trim().array, toBitSet(srcFlags(pid)), toBitSet(dstFlags(pid))) 87 | }) 88 | } 89 | 90 | /** Compact the given vector of Booleans into a BitSet. */ 91 | private def toBitSet(flags: PrimitiveVector[Boolean]): BitSet = { 92 | val bitset = new BitSet(flags.size) 93 | var i = 0 94 | while (i < flags.size) { 95 | if (flags(i)) { 96 | bitset.set(i) 97 | } 98 | i += 1 99 | } 100 | bitset 101 | } 102 | } 103 | 104 | /** 105 | * Stores the locations of edge-partition join sites for each vertex attribute in a particular 106 | * vertex partition. This provides routing information for shipping vertex attributes to edge 107 | * partitions. 108 | */ 109 | private[graphx] 110 | class RoutingTablePartition( 111 | private val routingTable: Array[(Array[VertexId], BitSet, BitSet)]) extends Serializable { 112 | /** The maximum number of edge partitions this `RoutingTablePartition` is built to join with. */ 113 | val numEdgePartitions: Int = routingTable.size 114 | 115 | /** Returns the number of vertices that will be sent to the specified edge partition. */ 116 | def partitionSize(pid: PartitionID): Int = routingTable(pid)._1.size 117 | 118 | /** Returns an iterator over all vertex ids stored in this `RoutingTablePartition`. */ 119 | def iterator: Iterator[VertexId] = routingTable.iterator.flatMap(_._1.iterator) 120 | 121 | /** Returns a new RoutingTablePartition reflecting a reversal of all edge directions. */ 122 | def reverse: RoutingTablePartition = { 123 | new RoutingTablePartition(routingTable.map { 124 | case (vids, srcVids, dstVids) => (vids, dstVids, srcVids) 125 | }) 126 | } 127 | 128 | /** 129 | * Runs `f` on each vertex id to be sent to the specified edge partition. Vertex ids can be 130 | * filtered by the position they have in the edge partition. 131 | */ 132 | def foreachWithinEdgePartition 133 | (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean) 134 | (f: VertexId => Unit) { 135 | val (vidsCandidate, srcVids, dstVids) = routingTable(pid) 136 | val size = vidsCandidate.length 137 | if (includeSrc && includeDst) { 138 | // Avoid checks for performance 139 | vidsCandidate.iterator.foreach(f) 140 | } else if (!includeSrc && !includeDst) { 141 | // Do nothing 142 | } else { 143 | val relevantVids = if (includeSrc) srcVids else dstVids 144 | relevantVids.iterator.foreach { i => f(vidsCandidate(i)) } 145 | } 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/ShippableVertexPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
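The routing-table message above packs two values into one `Int`: the edge-partition id in the lower 30 bits and the src/dst position flags in the upper 2 bits. The following standalone sketch mirrors `toMessage`, `pidFromMessage` and `positionFromMessage` so the encoding can be tried outside Spark; the object name is made up.

```scala
// Standalone sketch of the RoutingTableMessage encoding: the position byte
// (bit 0 = referenced as src, bit 1 = referenced as dst) goes into the upper
// 2 bits of an Int, the edge-partition id into the lower 30 bits.
object RoutingMessageDemo extends App {
  type VertexId = Long
  type PartitionID = Int

  def toMessage(vid: VertexId, pid: PartitionID, position: Byte): (VertexId, Int) =
    (vid, (position << 30) | (pid & 0x3FFFFFFF))

  def pidFromMessage(msg: (VertexId, Int)): PartitionID = msg._2 & 0x3FFFFFFF
  def positionFromMessage(msg: (VertexId, Int)): Byte = (msg._2 >> 30).toByte

  // Vertex 42 is referenced as both src and dst (0x1 | 0x2 = 0x3) in partition 7.
  val msg = toMessage(42L, 7, 0x3.toByte)
  assert(pidFromMessage(msg) == 7)
  // Only the low two bits of the recovered position are ever inspected
  // (src flag and dst flag), exactly as fromMsgs does above.
  val pos = positionFromMessage(msg)
  assert((pos & 0x1) != 0 && (pos & 0x2) != 0)
}
```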
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.util.collection.{BitSet, PrimitiveVector} 23 | 24 | import org.apache.spark.graphx._ 25 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 26 | 27 | /** Stores vertex attributes to ship to an edge partition. */ 28 | private[graphx] 29 | class VertexAttributeBlock[VD: ClassTag](val vids: Array[VertexId], val attrs: Array[VD]) 30 | extends Serializable { 31 | def iterator: Iterator[(VertexId, VD)] = 32 | (0 until vids.size).iterator.map { i => (vids(i), attrs(i)) } 33 | } 34 | 35 | private[graphx] 36 | object ShippableVertexPartition { 37 | /** Construct a `ShippableVertexPartition` from the given vertices without any routing table. */ 38 | def apply[VD: ClassTag](iter: Iterator[(VertexId, VD)]): ShippableVertexPartition[VD] = 39 | apply(iter, RoutingTablePartition.empty, null.asInstanceOf[VD], (a, b) => a) 40 | 41 | /** 42 | * Construct a `ShippableVertexPartition` from the given vertices with the specified routing 43 | * table, filling in missing vertices mentioned in the routing table using `defaultVal`. 44 | */ 45 | def apply[VD: ClassTag]( 46 | iter: Iterator[(VertexId, VD)], routingTable: RoutingTablePartition, defaultVal: VD) 47 | : ShippableVertexPartition[VD] = 48 | apply(iter, routingTable, defaultVal, (a, b) => a) 49 | 50 | /** 51 | * Construct a `ShippableVertexPartition` from the given vertices with the specified routing 52 | * table, filling in missing vertices mentioned in the routing table using `defaultVal`, 53 | * and merging duplicate vertex atrribute with mergeFunc. 54 | */ 55 | def apply[VD: ClassTag]( 56 | iter: Iterator[(VertexId, VD)], routingTable: RoutingTablePartition, defaultVal: VD, 57 | mergeFunc: (VD, VD) => VD): ShippableVertexPartition[VD] = { 58 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 59 | // Merge the given vertices using mergeFunc 60 | iter.foreach { pair => 61 | map.setMerge(pair._1, pair._2, mergeFunc) 62 | } 63 | // Fill in missing vertices mentioned in the routing table 64 | routingTable.iterator.foreach { vid => 65 | map.changeValue(vid, defaultVal, identity) 66 | } 67 | 68 | new ShippableVertexPartition(map.keySet, map._values, map.keySet.getBitSet, routingTable) 69 | } 70 | 71 | import scala.language.implicitConversions 72 | 73 | /** 74 | * Implicit conversion to allow invoking `VertexPartitionBase` operations directly on a 75 | * `ShippableVertexPartition`. 76 | */ 77 | implicit def shippablePartitionToOps[VD: ClassTag](partition: ShippableVertexPartition[VD]) 78 | : ShippableVertexPartitionOps[VD] = new ShippableVertexPartitionOps(partition) 79 | 80 | /** 81 | * Implicit evidence that `ShippableVertexPartition` is a member of the 82 | * `VertexPartitionBaseOpsConstructor` typeclass. 
This enables invoking `VertexPartitionBase` 83 | * operations on a `ShippableVertexPartition` via an evidence parameter, as in 84 | * [[VertexPartitionBaseOps]]. 85 | */ 86 | implicit object ShippableVertexPartitionOpsConstructor 87 | extends VertexPartitionBaseOpsConstructor[ShippableVertexPartition] { 88 | def toOps[VD: ClassTag](partition: ShippableVertexPartition[VD]) 89 | : VertexPartitionBaseOps[VD, ShippableVertexPartition] = shippablePartitionToOps(partition) 90 | } 91 | } 92 | 93 | /** 94 | * A map from vertex id to vertex attribute that additionally stores edge partition join sites for 95 | * each vertex attribute, enabling joining with an [[org.apache.spark.graphx.EdgeRDD]]. 96 | */ 97 | private[graphx] 98 | class ShippableVertexPartition[VD: ClassTag]( 99 | val index: VertexIdToIndexMap, 100 | val values: Array[VD], 101 | val mask: BitSet, 102 | val routingTable: RoutingTablePartition) 103 | extends VertexPartitionBase[VD] { 104 | 105 | /** Return a new ShippableVertexPartition with the specified routing table. */ 106 | def withRoutingTable(routingTable_ : RoutingTablePartition): ShippableVertexPartition[VD] = { 107 | new ShippableVertexPartition(index, values, mask, routingTable_) 108 | } 109 | 110 | /** 111 | * Generate a `VertexAttributeBlock` for each edge partition keyed on the edge partition ID. The 112 | * `VertexAttributeBlock` contains the vertex attributes from the current partition that are 113 | * referenced in the specified positions in the edge partition. 114 | */ 115 | def shipVertexAttributes( 116 | shipSrc: Boolean, shipDst: Boolean): Iterator[(PartitionID, VertexAttributeBlock[VD])] = { 117 | Iterator.tabulate(routingTable.numEdgePartitions) { pid => 118 | val initialSize = if (shipSrc && shipDst) routingTable.partitionSize(pid) else 64 119 | val vids = new PrimitiveVector[VertexId](initialSize) 120 | val attrs = new PrimitiveVector[VD](initialSize) 121 | var i = 0 122 | routingTable.foreachWithinEdgePartition(pid, shipSrc, shipDst) { vid => 123 | if (isDefined(vid)) { 124 | vids += vid 125 | attrs += this(vid) 126 | } 127 | i += 1 128 | } 129 | (pid, new VertexAttributeBlock(vids.trim().array, attrs.trim().array)) 130 | } 131 | } 132 | 133 | /** 134 | * Generate a `VertexId` array for each edge partition keyed on the edge partition ID. The array 135 | * contains the visible vertex ids from the current partition that are referenced in the edge 136 | * partition. 
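`shipVertexAttributes` emits, for every edge partition, a `VertexAttributeBlock` made of two parallel arrays (vertex ids and attributes) that the receiving edge partition re-pairs via `iterator` and feeds into `updateVertices`. A minimal standalone sketch of that shape (the `MiniAttributeBlock` name is illustrative; the real class is `private[graphx]`):

```scala
// Sketch of the VertexAttributeBlock shape produced by shipVertexAttributes:
// two parallel arrays that the receiving edge partition re-pairs lazily.
class MiniAttributeBlock[VD](val vids: Array[Long], val attrs: Array[VD]) {
  def iterator: Iterator[(Long, VD)] =
    vids.indices.iterator.map(i => (vids(i), attrs(i)))
}

object MiniAttributeBlockDemo extends App {
  val block = new MiniAttributeBlock[String](Array(1L, 4L, 9L), Array("a", "b", "c"))
  // The edge partition would feed this iterator into updateVertices.
  assert(block.iterator.toList == List((1L, "a"), (4L, "b"), (9L, "c")))
}
```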
137 | */ 138 | def shipVertexIds(): Iterator[(PartitionID, Array[VertexId])] = { 139 | Iterator.tabulate(routingTable.numEdgePartitions) { pid => 140 | val vids = new PrimitiveVector[VertexId](routingTable.partitionSize(pid)) 141 | var i = 0 142 | routingTable.foreachWithinEdgePartition(pid, true, true) { vid => 143 | if (isDefined(vid)) { 144 | vids += vid 145 | } 146 | i += 1 147 | } 148 | (pid, vids.trim().array) 149 | } 150 | } 151 | } 152 | 153 | private[graphx] class ShippableVertexPartitionOps[VD: ClassTag](self: ShippableVertexPartition[VD]) 154 | extends VertexPartitionBaseOps[VD, ShippableVertexPartition](self) { 155 | 156 | def withIndex(index: VertexIdToIndexMap): ShippableVertexPartition[VD] = { 157 | new ShippableVertexPartition(index, self.values, self.mask, self.routingTable) 158 | } 159 | 160 | def withValues[VD2: ClassTag](values: Array[VD2]): ShippableVertexPartition[VD2] = { 161 | new ShippableVertexPartition(self.index, values, self.mask, self.routingTable) 162 | } 163 | 164 | def withMask(mask: BitSet): ShippableVertexPartition[VD] = { 165 | new ShippableVertexPartition(self.index, self.values, mask, self.routingTable) 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.util.collection.BitSet 23 | 24 | import org.apache.spark.graphx._ 25 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 26 | 27 | private[graphx] object VertexPartition { 28 | /** Construct a `VertexPartition` from the given vertices. */ 29 | def apply[VD: ClassTag](iter: Iterator[(VertexId, VD)]) 30 | : VertexPartition[VD] = { 31 | val (index, values, mask) = VertexPartitionBase.initFrom(iter) 32 | new VertexPartition(index, values, mask) 33 | } 34 | 35 | import scala.language.implicitConversions 36 | 37 | /** 38 | * Implicit conversion to allow invoking `VertexPartitionBase` operations directly on a 39 | * `VertexPartition`. 40 | */ 41 | implicit def partitionToOps[VD: ClassTag](partition: VertexPartition[VD]) 42 | : VertexPartitionOps[VD] = new VertexPartitionOps(partition) 43 | 44 | /** 45 | * Implicit evidence that `VertexPartition` is a member of the `VertexPartitionBaseOpsConstructor` 46 | * typeclass. This enables invoking `VertexPartitionBase` operations on a `VertexPartition` via an 47 | * evidence parameter, as in [[VertexPartitionBaseOps]]. 
48 | */ 49 | implicit object VertexPartitionOpsConstructor 50 | extends VertexPartitionBaseOpsConstructor[VertexPartition] { 51 | def toOps[VD: ClassTag](partition: VertexPartition[VD]) 52 | : VertexPartitionBaseOps[VD, VertexPartition] = partitionToOps(partition) 53 | } 54 | } 55 | 56 | /** A map from vertex id to vertex attribute. */ 57 | private[graphx] class VertexPartition[VD: ClassTag]( 58 | val index: VertexIdToIndexMap, 59 | val values: Array[VD], 60 | val mask: BitSet) 61 | extends VertexPartitionBase[VD] 62 | 63 | private[graphx] class VertexPartitionOps[VD: ClassTag](self: VertexPartition[VD]) 64 | extends VertexPartitionBaseOps[VD, VertexPartition](self) { 65 | 66 | def withIndex(index: VertexIdToIndexMap): VertexPartition[VD] = { 67 | new VertexPartition(index, self.values, self.mask) 68 | } 69 | 70 | def withValues[VD2: ClassTag](values: Array[VD2]): VertexPartition[VD2] = { 71 | new VertexPartition(self.index, values, self.mask) 72 | } 73 | 74 | def withMask(mask: BitSet): VertexPartition[VD] = { 75 | new VertexPartition(self.index, self.values, mask) 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.language.higherKinds 21 | import scala.reflect.ClassTag 22 | 23 | import org.apache.spark.util.collection.BitSet 24 | 25 | import org.apache.spark.graphx._ 26 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 27 | 28 | private[graphx] object VertexPartitionBase { 29 | /** 30 | * Construct the constituents of a VertexPartitionBase from the given vertices, merging duplicate 31 | * entries arbitrarily. 32 | */ 33 | def initFrom[VD: ClassTag](iter: Iterator[(VertexId, VD)]) 34 | : (VertexIdToIndexMap, Array[VD], BitSet) = { 35 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 36 | iter.foreach { pair => 37 | map(pair._1) = pair._2 38 | } 39 | (map.keySet, map._values, map.keySet.getBitSet) 40 | } 41 | 42 | /** 43 | * Construct the constituents of a VertexPartitionBase from the given vertices, merging duplicate 44 | * entries using `mergeFunc`. 
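Both `initFrom` variants fold duplicate vertex ids into a single entry: without a merge function one of the duplicate values simply survives ("arbitrarily"), while with `mergeFunc` the duplicates are combined. A plain-Scala sketch of the two behaviours, using a mutable map instead of `GraphXPrimitiveKeyOpenHashMap` purely for illustration:

```scala
// Sketch of the duplicate-vertex handling described above (illustrative only).
import scala.collection.mutable

object InitFromDemo extends App {
  val input = Seq((1L, 10), (2L, 20), (1L, 30))

  // initFrom without a merge function: one duplicate value survives
  // (whichever is written last -- the doc calls this "arbitrary").
  val arbitrary = mutable.Map[Long, Int]()
  input.foreach { case (vid, v) => arbitrary(vid) = v }
  assert(arbitrary(1L) == 30)

  // initFrom with a merge function, e.g. summing duplicate attributes.
  val merged = mutable.Map[Long, Int]()
  input.foreach { case (vid, v) =>
    merged(vid) = merged.get(vid).map(old => old + v).getOrElse(v)
  }
  assert(merged(1L) == 40)
}
```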
45 | */ 46 | def initFrom[VD: ClassTag](iter: Iterator[(VertexId, VD)], mergeFunc: (VD, VD) => VD) 47 | : (VertexIdToIndexMap, Array[VD], BitSet) = { 48 | val map = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 49 | iter.foreach { pair => 50 | map.setMerge(pair._1, pair._2, mergeFunc) 51 | } 52 | (map.keySet, map._values, map.keySet.getBitSet) 53 | } 54 | } 55 | 56 | /** 57 | * An abstract map from vertex id to vertex attribute. [[VertexPartition]] is the corresponding 58 | * concrete implementation. [[VertexPartitionBaseOps]] provides a variety of operations for 59 | * VertexPartitionBase and subclasses that provide implicit evidence of membership in the 60 | * `VertexPartitionBaseOpsConstructor` typeclass (for example, 61 | * [[VertexPartition.VertexPartitionOpsConstructor]]). 62 | */ 63 | private[graphx] abstract class VertexPartitionBase[@specialized(Long, Int, Double) VD: ClassTag] 64 | extends Serializable { 65 | 66 | def index: VertexIdToIndexMap 67 | def values: Array[VD] 68 | def mask: BitSet 69 | 70 | val capacity: Int = index.capacity 71 | 72 | def size: Int = mask.cardinality() 73 | 74 | /** Return the vertex attribute for the given vertex ID. */ 75 | def apply(vid: VertexId): VD = values(index.getPos(vid)) 76 | 77 | def isDefined(vid: VertexId): Boolean = { 78 | val pos = index.getPos(vid) 79 | pos >= 0 && mask.get(pos) 80 | } 81 | 82 | def iterator: Iterator[(VertexId, VD)] = 83 | mask.iterator.map(ind => (index.getValue(ind), values(ind))) 84 | } 85 | 86 | /** 87 | * A typeclass for subclasses of `VertexPartitionBase` representing the ability to wrap them in a 88 | * `VertexPartitionBaseOps`. 89 | */ 90 | private[graphx] trait VertexPartitionBaseOpsConstructor[T[X] <: VertexPartitionBase[X]] { 91 | def toOps[VD: ClassTag](partition: T[VD]): VertexPartitionBaseOps[VD, T] 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexPartitionBaseOps.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
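`VertexPartitionBaseOpsConstructor` is a small typeclass: the operations are written once in `VertexPartitionBaseOps` against an abstract partition type, and an implicit constructor object wraps each concrete subclass (`VertexPartition`, `ShippableVertexPartition`) so those operations return the right concrete type. A minimal standalone sketch of the same pattern with made-up names (`BasePart`, `ConcretePart`, `OpsConstructor`):

```scala
// Sketch of the typeclass pattern behind VertexPartitionBaseOpsConstructor,
// with hypothetical names. Ops are written once against an abstract T[_]
// and an implicit "constructor" wraps each concrete subclass.
import scala.language.higherKinds
import scala.reflect.ClassTag

abstract class BasePart[VD] { def values: Array[VD] }
class ConcretePart[VD](val values: Array[VD]) extends BasePart[VD]

trait OpsConstructor[T[X] <: BasePart[X]] {
  def toOps[VD](p: T[VD]): BaseOps[VD, T]
}

abstract class BaseOps[VD, Self[X] <: BasePart[X]](self: Self[VD]) {
  def withValues[VD2](values: Array[VD2]): Self[VD2]
  // One generic operation shared by every concrete partition type.
  def mapValues[VD2](f: VD => VD2)(implicit ct: ClassTag[VD2]): Self[VD2] =
    withValues(self.values.map(f))
}

object ConcretePart {
  implicit object Constructor extends OpsConstructor[ConcretePart] {
    def toOps[VD](p: ConcretePart[VD]): BaseOps[VD, ConcretePart] =
      new BaseOps[VD, ConcretePart](p) {
        def withValues[VD2](values: Array[VD2]): ConcretePart[VD2] = new ConcretePart(values)
      }
  }
}

object TypeclassDemo extends App {
  val part = new ConcretePart(Array(1, 2, 3))
  val doubled = ConcretePart.Constructor.toOps(part).mapValues(_ * 2)
  assert(doubled.values.toSeq == Seq(2, 4, 6))
}
```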
16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.language.higherKinds 21 | import scala.language.implicitConversions 22 | import scala.reflect.ClassTag 23 | 24 | import org.apache.spark.Logging 25 | import org.apache.spark.util.collection.BitSet 26 | 27 | import org.apache.spark.graphx._ 28 | import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap 29 | 30 | /** 31 | * An class containing additional operations for subclasses of VertexPartitionBase that provide 32 | * implicit evidence of membership in the `VertexPartitionBaseOpsConstructor` typeclass (for 33 | * example, [[VertexPartition.VertexPartitionOpsConstructor]]). 34 | */ 35 | private[graphx] abstract class VertexPartitionBaseOps 36 | [VD: ClassTag, Self[X] <: VertexPartitionBase[X] : VertexPartitionBaseOpsConstructor] 37 | (self: Self[VD]) 38 | extends Serializable with Logging { 39 | 40 | def withIndex(index: VertexIdToIndexMap): Self[VD] 41 | def withValues[VD2: ClassTag](values: Array[VD2]): Self[VD2] 42 | def withMask(mask: BitSet): Self[VD] 43 | 44 | /** 45 | * Pass each vertex attribute along with the vertex id through a map 46 | * function and retain the original RDD's partitioning and index. 47 | * 48 | * @tparam VD2 the type returned by the map function 49 | * 50 | * @param f the function applied to each vertex id and vertex 51 | * attribute in the RDD 52 | * 53 | * @return a new VertexPartition with values obtained by applying `f` to 54 | * each of the entries in the original VertexRDD. The resulting 55 | * VertexPartition retains the same index. 56 | */ 57 | def map[VD2: ClassTag](f: (VertexId, VD) => VD2): Self[VD2] = { 58 | // Construct a view of the map transformation 59 | val newValues = new Array[VD2](self.capacity) 60 | var i = self.mask.nextSetBit(0) 61 | while (i >= 0) { 62 | newValues(i) = f(self.index.getValue(i), self.values(i)) 63 | i = self.mask.nextSetBit(i + 1) 64 | } 65 | this.withValues(newValues) 66 | } 67 | 68 | /** 69 | * Restrict the vertex set to the set of vertices satisfying the given predicate. 70 | * 71 | * @param pred the user defined predicate 72 | * 73 | * @note The vertex set preserves the original index structure which means that the returned 74 | * RDD can be easily joined with the original vertex-set. Furthermore, the filter only 75 | * modifies the bitmap index and so no new values are allocated. 76 | */ 77 | def filter(pred: (VertexId, VD) => Boolean): Self[VD] = { 78 | // Allocate the array to store the results into 79 | val newMask = new BitSet(self.capacity) 80 | // Iterate over the active bits in the old mask and evaluate the predicate 81 | var i = self.mask.nextSetBit(0) 82 | while (i >= 0) { 83 | if (pred(self.index.getValue(i), self.values(i))) { 84 | newMask.set(i) 85 | } 86 | i = self.mask.nextSetBit(i + 1) 87 | } 88 | this.withMask(newMask) 89 | } 90 | 91 | /** Hides the VertexId's that are the same between `this` and `other`. */ 92 | def minus(other: Self[VD]): Self[VD] = { 93 | if (self.index != other.index) { 94 | logWarning("Minus operations on two VertexPartitions with different indexes is slow.") 95 | minus(createUsingIndex(other.iterator)) 96 | } else { 97 | self.withMask(self.mask.andNot(other.mask)) 98 | } 99 | } 100 | 101 | /** Hides the VertexId's that are the same between `this` and `other`. */ 102 | def minus(other: Iterator[(VertexId, VD)]): Self[VD] = { 103 | minus(createUsingIndex(other)) 104 | } 105 | 106 | /** 107 | * Hides vertices that are the same between this and other. 
For vertices that are different, keeps 108 | * the values from `other`. The indices of `this` and `other` must be the same. 109 | */ 110 | def diff(other: Self[VD]): Self[VD] = { 111 | if (self.index != other.index) { 112 | logWarning("Diffing two VertexPartitions with different indexes is slow.") 113 | diff(createUsingIndex(other.iterator)) 114 | } else { 115 | val newMask = self.mask & other.mask 116 | var i = newMask.nextSetBit(0) 117 | while (i >= 0) { 118 | if (self.values(i) == other.values(i)) { 119 | newMask.unset(i) 120 | } 121 | i = newMask.nextSetBit(i + 1) 122 | } 123 | this.withValues(other.values).withMask(newMask) 124 | } 125 | } 126 | 127 | /** Left outer join another VertexPartition. */ 128 | def leftJoin[VD2: ClassTag, VD3: ClassTag] 129 | (other: Self[VD2]) 130 | (f: (VertexId, VD, Option[VD2]) => VD3): Self[VD3] = { 131 | if (self.index != other.index) { 132 | logWarning("Joining two VertexPartitions with different indexes is slow.") 133 | leftJoin(createUsingIndex(other.iterator))(f) 134 | } else { 135 | val newValues = new Array[VD3](self.capacity) 136 | 137 | var i = self.mask.nextSetBit(0) 138 | while (i >= 0) { 139 | val otherV: Option[VD2] = if (other.mask.get(i)) Some(other.values(i)) else None 140 | newValues(i) = f(self.index.getValue(i), self.values(i), otherV) 141 | i = self.mask.nextSetBit(i + 1) 142 | } 143 | this.withValues(newValues) 144 | } 145 | } 146 | 147 | /** Left outer join another iterator of messages. */ 148 | def leftJoin[VD2: ClassTag, VD3: ClassTag] 149 | (other: Iterator[(VertexId, VD2)]) 150 | (f: (VertexId, VD, Option[VD2]) => VD3): Self[VD3] = { 151 | leftJoin(createUsingIndex(other))(f) 152 | } 153 | 154 | /** Inner join another VertexPartition. */ 155 | def innerJoin[U: ClassTag, VD2: ClassTag] 156 | (other: Self[U]) 157 | (f: (VertexId, VD, U) => VD2): Self[VD2] = { 158 | if (self.index != other.index) { 159 | logWarning("Joining two VertexPartitions with different indexes is slow.") 160 | innerJoin(createUsingIndex(other.iterator))(f) 161 | } else { 162 | val newMask = self.mask & other.mask 163 | val newValues = new Array[VD2](self.capacity) 164 | var i = newMask.nextSetBit(0) 165 | while (i >= 0) { 166 | newValues(i) = f(self.index.getValue(i), self.values(i), other.values(i)) 167 | i = newMask.nextSetBit(i + 1) 168 | } 169 | this.withValues(newValues).withMask(newMask) 170 | } 171 | } 172 | 173 | /** 174 | * Inner join an iterator of messages. 175 | */ 176 | def innerJoin[U: ClassTag, VD2: ClassTag] 177 | (iter: Iterator[Product2[VertexId, U]]) 178 | (f: (VertexId, VD, U) => VD2): Self[VD2] = { 179 | innerJoin(createUsingIndex(iter))(f) 180 | } 181 | 182 | /** 183 | * Similar effect as aggregateUsingIndex((a, b) => a) 184 | */ 185 | def createUsingIndex[VD2: ClassTag](iter: Iterator[Product2[VertexId, VD2]]) 186 | : Self[VD2] = { 187 | val newMask = new BitSet(self.capacity) 188 | val newValues = new Array[VD2](self.capacity) 189 | iter.foreach { pair => 190 | val pos = self.index.getPos(pair._1) 191 | if (pos >= 0) { 192 | newMask.set(pos) 193 | newValues(pos) = pair._2 194 | } 195 | } 196 | this.withValues(newValues).withMask(newMask) 197 | } 198 | 199 | /** 200 | * Similar to innerJoin, but vertices from the left side that don't appear in iter will remain in 201 | * the partition, hidden by the bitmask. 
202 | */ 203 | def innerJoinKeepLeft(iter: Iterator[Product2[VertexId, VD]]): Self[VD] = { 204 | val newMask = new BitSet(self.capacity) 205 | val newValues = new Array[VD](self.capacity) 206 | System.arraycopy(self.values, 0, newValues, 0, newValues.length) 207 | iter.foreach { pair => 208 | val pos = self.index.getPos(pair._1) 209 | if (pos >= 0) { 210 | newMask.set(pos) 211 | newValues(pos) = pair._2 212 | } 213 | } 214 | this.withValues(newValues).withMask(newMask) 215 | } 216 | 217 | def aggregateUsingIndex[VD2: ClassTag]( 218 | iter: Iterator[Product2[VertexId, VD2]], 219 | reduceFunc: (VD2, VD2) => VD2): Self[VD2] = { 220 | val newMask = new BitSet(self.capacity) 221 | val newValues = new Array[VD2](self.capacity) 222 | iter.foreach { product => 223 | val vid = product._1 224 | val vdata = product._2 225 | val pos = self.index.getPos(vid) 226 | if (pos >= 0) { 227 | if (newMask.get(pos)) { 228 | newValues(pos) = reduceFunc(newValues(pos), vdata) 229 | } else { // otherwise just store the new value 230 | newMask.set(pos) 231 | newValues(pos) = vdata 232 | } 233 | } 234 | } 235 | this.withValues(newValues).withMask(newMask) 236 | } 237 | 238 | /** 239 | * Construct a new VertexPartition whose index contains only the vertices in the mask. 240 | */ 241 | def reindex(): Self[VD] = { 242 | val hashMap = new GraphXPrimitiveKeyOpenHashMap[VertexId, VD] 243 | val arbitraryMerge = (a: VD, b: VD) => a 244 | for ((k, v) <- self.iterator) { 245 | hashMap.setMerge(k, v, arbitraryMerge) 246 | } 247 | this.withIndex(hashMap.keySet).withValues(hashMap._values).withMask(hashMap.keySet.getBitSet) 248 | } 249 | 250 | /** 251 | * Converts a vertex partition (in particular, one of type `Self`) into a 252 | * `VertexPartitionBaseOps`. Within this class, this allows chaining the methods defined above, 253 | * because these methods return a `Self` and this implicit conversion re-wraps that in a 254 | * `VertexPartitionBaseOps`. This relies on the context bound on `Self`. 255 | */ 256 | private implicit def toOps[VD2: ClassTag](partition: Self[VD2]) 257 | : VertexPartitionBaseOps[VD2, Self] = { 258 | implicitly[VertexPartitionBaseOpsConstructor[Self]].toOps(partition) 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/VertexRDDImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
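A key point in the operations above (`filter`, `diff`, `minus`) is that they never copy the `values` array; they only compute a new bitmask over the shared index. A standalone sketch of that idea, using `java.util.BitSet` in place of Spark's internal `BitSet`:

```scala
// Standalone sketch of the mask trick used by `filter`/`diff`: the values
// array is untouched, only a bitmask selects the visible entries.
import java.util.{BitSet => JBitSet}

object MaskFilterDemo extends App {
  val ids    = Array(1L, 2L, 3L, 4L)
  val values = Array(10,  20,  30,  40)

  // Initially every slot is visible.
  val mask = new JBitSet(ids.length)
  mask.set(0, ids.length)

  // "filter(pred)": build a new mask, never copy `values`.
  val pred: (Long, Int) => Boolean = (_, v) => v >= 25
  val newMask = new JBitSet(ids.length)
  var i = mask.nextSetBit(0)
  while (i >= 0) {
    if (pred(ids(i), values(i))) newMask.set(i)
    i = mask.nextSetBit(i + 1)
  }

  val visible = (0 until ids.length).filter(j => newMask.get(j)).map(j => ids(j) -> values(j))
  assert(visible == Seq(3L -> 30, 4L -> 40))
}
```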
16 | */ 17 | 18 | package org.apache.spark.graphx.impl 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | import org.apache.spark.rdd._ 25 | import org.apache.spark.storage.StorageLevel 26 | 27 | import org.apache.spark.graphx._ 28 | 29 | class VertexRDDImpl[VD] private[graphx] ( 30 | @transient val partitionsRDD: RDD[ShippableVertexPartition[VD]], 31 | val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) 32 | (implicit override protected val vdTag: ClassTag[VD]) 33 | extends VertexRDD[VD](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { 34 | 35 | require(partitionsRDD.partitioner.isDefined) 36 | 37 | override def reindex(): VertexRDD[VD] = this.withPartitionsRDD(partitionsRDD.map(_.reindex())) 38 | 39 | override val partitioner = partitionsRDD.partitioner 40 | 41 | override protected def getPreferredLocations(s: Partition): Seq[String] = 42 | partitionsRDD.preferredLocations(s) 43 | 44 | override def setName(_name: String): this.type = { 45 | if (partitionsRDD.name != null) { 46 | partitionsRDD.setName(partitionsRDD.name + ", " + _name) 47 | } else { 48 | partitionsRDD.setName(_name) 49 | } 50 | this 51 | } 52 | setName("VertexRDD") 53 | 54 | /** 55 | * Persists the vertex partitions at the specified storage level, ignoring any existing target 56 | * storage level. 57 | */ 58 | override def persist(newLevel: StorageLevel): this.type = { 59 | partitionsRDD.persist(newLevel) 60 | this 61 | } 62 | 63 | override def unpersist(blocking: Boolean = true): this.type = { 64 | partitionsRDD.unpersist(blocking) 65 | this 66 | } 67 | 68 | /** Persists the vertex partitions at `targetStorageLevel`, which defaults to MEMORY_ONLY. */ 69 | override def cache(): this.type = { 70 | partitionsRDD.persist(targetStorageLevel) 71 | this 72 | } 73 | 74 | override def getStorageLevel: StorageLevel = partitionsRDD.getStorageLevel 75 | 76 | override def checkpoint(): Unit = { 77 | partitionsRDD.checkpoint() 78 | } 79 | 80 | override def isCheckpointed: Boolean = { 81 | firstParent[ShippableVertexPartition[VD]].isCheckpointed 82 | } 83 | 84 | override def getCheckpointFile: Option[String] = { 85 | partitionsRDD.getCheckpointFile 86 | } 87 | 88 | /** The number of vertices in the RDD. 
*/ 89 | override def count(): Long = { 90 | partitionsRDD.map(_.size.toLong).reduce(_ + _) 91 | } 92 | 93 | override private[graphx] def mapVertexPartitions[VD2: ClassTag]( 94 | f: ShippableVertexPartition[VD] => ShippableVertexPartition[VD2]) 95 | : VertexRDD[VD2] = { 96 | val newPartitionsRDD = partitionsRDD.mapPartitions(_.map(f), preservesPartitioning = true) 97 | this.withPartitionsRDD(newPartitionsRDD) 98 | } 99 | 100 | override def mapValues[VD2: ClassTag](f: VD => VD2): VertexRDD[VD2] = 101 | this.mapVertexPartitions(_.map((vid, attr) => f(attr))) 102 | 103 | override def mapValues[VD2: ClassTag](f: (VertexId, VD) => VD2): VertexRDD[VD2] = 104 | this.mapVertexPartitions(_.map(f)) 105 | 106 | override def minus(other: RDD[(VertexId, VD)]): VertexRDD[VD] = { 107 | minus(this.aggregateUsingIndex(other, (a: VD, b: VD) => a)) 108 | } 109 | 110 | override def minus (other: VertexRDD[VD]): VertexRDD[VD] = { 111 | other match { 112 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 113 | this.withPartitionsRDD[VD]( 114 | partitionsRDD.zipPartitions( 115 | other.partitionsRDD, preservesPartitioning = true) { 116 | (thisIter, otherIter) => 117 | val thisPart = thisIter.next() 118 | val otherPart = otherIter.next() 119 | Iterator(thisPart.minus(otherPart)) 120 | }) 121 | case _ => 122 | this.withPartitionsRDD[VD]( 123 | partitionsRDD.zipPartitions( 124 | other.partitionBy(this.partitioner.get), preservesPartitioning = true) { 125 | (partIter, msgs) => partIter.map(_.minus(msgs)) 126 | } 127 | ) 128 | } 129 | } 130 | 131 | override def diff(other: RDD[(VertexId, VD)]): VertexRDD[VD] = { 132 | diff(this.aggregateUsingIndex(other, (a: VD, b: VD) => a)) 133 | } 134 | 135 | override def diff(other: VertexRDD[VD]): VertexRDD[VD] = { 136 | val otherPartition = other match { 137 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 138 | other.partitionsRDD 139 | case _ => 140 | VertexRDD(other.partitionBy(this.partitioner.get)).partitionsRDD 141 | } 142 | val newPartitionsRDD = partitionsRDD.zipPartitions( 143 | otherPartition, preservesPartitioning = true 144 | ) { (thisIter, otherIter) => 145 | val thisPart = thisIter.next() 146 | val otherPart = otherIter.next() 147 | Iterator(thisPart.diff(otherPart)) 148 | } 149 | this.withPartitionsRDD(newPartitionsRDD) 150 | } 151 | 152 | override def leftZipJoin[VD2: ClassTag, VD3: ClassTag] 153 | (other: VertexRDD[VD2])(f: (VertexId, VD, Option[VD2]) => VD3): VertexRDD[VD3] = { 154 | val newPartitionsRDD = partitionsRDD.zipPartitions( 155 | other.partitionsRDD, preservesPartitioning = true 156 | ) { (thisIter, otherIter) => 157 | val thisPart = thisIter.next() 158 | val otherPart = otherIter.next() 159 | Iterator(thisPart.leftJoin(otherPart)(f)) 160 | } 161 | this.withPartitionsRDD(newPartitionsRDD) 162 | } 163 | 164 | override def leftJoin[VD2: ClassTag, VD3: ClassTag] 165 | (other: RDD[(VertexId, VD2)]) 166 | (f: (VertexId, VD, Option[VD2]) => VD3) 167 | : VertexRDD[VD3] = { 168 | // Test if the other vertex is a VertexRDD to choose the optimal join strategy. 
169 | // If the other set is a VertexRDD then we use the much more efficient leftZipJoin 170 | other match { 171 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 172 | leftZipJoin(other)(f) 173 | case _ => 174 | this.withPartitionsRDD[VD3]( 175 | partitionsRDD.zipPartitions( 176 | other.partitionBy(this.partitioner.get), preservesPartitioning = true) { 177 | (partIter, msgs) => partIter.map(_.leftJoin(msgs)(f)) 178 | } 179 | ) 180 | } 181 | } 182 | 183 | override def innerZipJoin[U: ClassTag, VD2: ClassTag](other: VertexRDD[U]) 184 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] = { 185 | val newPartitionsRDD = partitionsRDD.zipPartitions( 186 | other.partitionsRDD, preservesPartitioning = true 187 | ) { (thisIter, otherIter) => 188 | val thisPart = thisIter.next() 189 | val otherPart = otherIter.next() 190 | Iterator(thisPart.innerJoin(otherPart)(f)) 191 | } 192 | this.withPartitionsRDD(newPartitionsRDD) 193 | } 194 | 195 | override def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) 196 | (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] = { 197 | // Test if the other vertex is a VertexRDD to choose the optimal join strategy. 198 | // If the other set is a VertexRDD then we use the much more efficient innerZipJoin 199 | other match { 200 | case other: VertexRDD[_] if this.partitioner == other.partitioner => 201 | innerZipJoin(other)(f) 202 | case _ => 203 | this.withPartitionsRDD( 204 | partitionsRDD.zipPartitions( 205 | other.partitionBy(this.partitioner.get), preservesPartitioning = true) { 206 | (partIter, msgs) => partIter.map(_.innerJoin(msgs)(f)) 207 | } 208 | ) 209 | } 210 | } 211 | 212 | override def aggregateUsingIndex[VD2: ClassTag]( 213 | messages: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] = { 214 | val shuffled = messages.partitionBy(this.partitioner.get) 215 | val parts = partitionsRDD.zipPartitions(shuffled, true) { (thisIter, msgIter) => 216 | thisIter.map(_.aggregateUsingIndex(msgIter, reduceFunc)) 217 | } 218 | this.withPartitionsRDD[VD2](parts) 219 | } 220 | 221 | override def reverseRoutingTables(): VertexRDD[VD] = 222 | this.mapVertexPartitions(vPart => vPart.withRoutingTable(vPart.routingTable.reverse)) 223 | 224 | override def withEdges(edges: EdgeRDD[_]): VertexRDD[VD] = { 225 | val routingTables = VertexRDD.createRoutingTables(edges, this.partitioner.get) 226 | val vertexPartitions = partitionsRDD.zipPartitions(routingTables, true) { 227 | (partIter, routingTableIter) => 228 | val routingTable = 229 | if (routingTableIter.hasNext) routingTableIter.next() else RoutingTablePartition.empty 230 | partIter.map(_.withRoutingTable(routingTable)) 231 | } 232 | this.withPartitionsRDD(vertexPartitions) 233 | } 234 | 235 | override private[graphx] def withPartitionsRDD[VD2: ClassTag]( 236 | partitionsRDD: RDD[ShippableVertexPartition[VD2]]): VertexRDD[VD2] = { 237 | new VertexRDDImpl(partitionsRDD, this.targetStorageLevel) 238 | } 239 | 240 | override private[graphx] def withTargetStorageLevel( 241 | targetStorageLevel: StorageLevel): VertexRDD[VD] = { 242 | new VertexRDDImpl(this.partitionsRDD, targetStorageLevel) 243 | } 244 | 245 | override private[graphx] def shipVertexAttributes( 246 | shipSrc: Boolean, shipDst: Boolean): RDD[(PartitionID, VertexAttributeBlock[VD])] = { 247 | partitionsRDD.mapPartitions(_.flatMap(_.shipVertexAttributes(shipSrc, shipDst))) 248 | } 249 | 250 | override private[graphx] def shipVertexIds(): RDD[(PartitionID, Array[VertexId])] = { 251 | 
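The operators implemented here back the public `VertexRDD` API, so they are normally reached through `graph.vertices`. A small usage sketch, assuming a local `SparkContext` named `sc` and made-up vertex/edge data:

```scala
// Usage sketch of the public VertexRDD operators whose implementations appear
// above (leftJoin, innerJoin, aggregateUsingIndex). Assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._

object VertexRddJoinExamples {
  def run(sc: SparkContext): Unit = {
    val graph = Graph.fromEdges(
      sc.parallelize(Seq(Edge(1L, 2L, "follows"), Edge(2L, 3L, "follows"))), 0)

    val verts: VertexRDD[Int] = graph.vertices          // (VertexId, Int)
    val ages = sc.parallelize(Seq((1L, 35), (3L, 18)))  // extra per-vertex data

    // leftJoin keeps every vertex; missing right-hand values arrive as None.
    val withAges = verts.leftJoin(ages) { (vid, attr, ageOpt) => ageOpt.getOrElse(-1) }

    // innerJoin keeps only vertices present on both sides.
    val onlyKnown = verts.innerJoin(ages) { (vid, attr, age) => age }

    // aggregateUsingIndex ships the messages to the existing vertex partitions
    // and reduces duplicates per vertex id using the existing index.
    val msgs = sc.parallelize(Seq((2L, 1), (2L, 1), (3L, 1)))
    val counts = verts.aggregateUsingIndex[Int](msgs, _ + _)

    println(withAges.collect().toSeq)
    println(onlyKnown.collect().toSeq)
    println(counts.collect().toSeq)
  }
}
```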
partitionsRDD.mapPartitions(_.flatMap(_.shipVertexIds())) 252 | } 253 | 254 | } 255 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/impl/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | import org.apache.spark.util.collection.OpenHashSet 21 | 22 | package object impl { 23 | private[graphx] type VertexIdToIndexMap = OpenHashSet[VertexId] 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/ConnectedComponents.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | 24 | /** Connected components algorithm. */ 25 | object ConnectedComponents { 26 | /** 27 | * Compute the connected component membership of each vertex and return a graph with the vertex 28 | * value containing the lowest vertex id in the connected component containing that vertex. 
29 | * 30 | * @tparam VD the vertex attribute type (discarded in the computation) 31 | * @tparam ED the edge attribute type (preserved in the computation) 32 | * 33 | * @param graph the graph for which to compute the connected components 34 | * 35 | * @return a graph with vertex attributes containing the smallest vertex in each 36 | * connected component 37 | */ 38 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[VertexId, ED] = { 39 | val ccGraph = graph.mapVertices { case (vid, _) => vid } 40 | def sendMessage(edge: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, VertexId)] = { 41 | if (edge.srcAttr < edge.dstAttr) { 42 | Iterator((edge.dstId, edge.srcAttr)) 43 | } else if (edge.srcAttr > edge.dstAttr) { 44 | Iterator((edge.srcId, edge.dstAttr)) 45 | } else { 46 | Iterator.empty 47 | } 48 | } 49 | val initialMessage = Long.MaxValue 50 | Pregel(ccGraph, initialMessage, activeDirection = EdgeDirection.Either)( 51 | vprog = (id, attr, msg) => math.min(attr, msg), 52 | sendMsg = sendMessage, 53 | mergeMsg = (a, b) => math.min(a, b)) 54 | } // end of connectedComponents 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/LabelPropagation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | import org.apache.spark.graphx._ 22 | 23 | /** Label Propagation algorithm. */ 24 | object LabelPropagation { 25 | /** 26 | * Run static Label Propagation for detecting communities in networks. 27 | * 28 | * Each node in the network is initially assigned to its own community. At every superstep, nodes 29 | * send their community affiliation to all neighbors and update their state to the mode community 30 | * affiliation of incoming messages. 31 | * 32 | * LPA is a standard community detection algorithm for graphs. It is very inexpensive 33 | * computationally, although (1) convergence is not guaranteed and (2) one can end up with 34 | * trivial solutions (all nodes are identified into a single community). 35 | * 36 | * @tparam ED the edge attribute type (not used in the computation) 37 | * 38 | * @param graph the graph for which to compute the community affiliation 39 | * @param maxSteps the number of supersteps of LPA to be performed. Because this is a static 40 | * implementation, the algorithm will run for exactly this many supersteps. 
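Typical usage of the connected-components implementation above, either through the `GraphOps` shorthand or by calling the `lib` object directly. The sketch assumes a local `SparkContext` named `sc` and a made-up edge list:

```scala
// Usage sketch for ConnectedComponents; assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._

object ConnectedComponentsExample {
  def run(sc: SparkContext): Unit = {
    // Two components: {1, 2, 3} and {7, 8}.
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(7L, 8L, 1)))
    val graph = Graph.fromEdges(edges, 0)

    // Equivalent calls: the GraphOps shorthand or lib.ConnectedComponents.run(graph).
    val cc = graph.connectedComponents()   // Graph[VertexId, Int]

    // Each vertex now carries the smallest vertex id of its component.
    cc.vertices.collect().sorted.foreach { case (vid, comp) => println(s"$vid -> $comp") }
  }
}
```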
41 | * 42 | * @return a graph with vertex attributes containing the label of community affiliation 43 | */ 44 | def run[VD, ED: ClassTag](graph: Graph[VD, ED], maxSteps: Int): Graph[VertexId, ED] = { 45 | val lpaGraph = graph.mapVertices { case (vid, _) => vid } 46 | def sendMessage(e: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, Map[VertexId, Long])] = { 47 | Iterator((e.srcId, Map(e.dstAttr -> 1L)), (e.dstId, Map(e.srcAttr -> 1L))) 48 | } 49 | def mergeMessage(count1: Map[VertexId, Long], count2: Map[VertexId, Long]) 50 | : Map[VertexId, Long] = { 51 | (count1.keySet ++ count2.keySet).map { i => 52 | val count1Val = count1.getOrElse(i, 0L) 53 | val count2Val = count2.getOrElse(i, 0L) 54 | i -> (count1Val + count2Val) 55 | }.toMap 56 | } 57 | def vertexProgram(vid: VertexId, attr: Long, message: Map[VertexId, Long]): VertexId = { 58 | if (message.isEmpty) attr else message.maxBy(_._2)._1 59 | } 60 | val initialMessage = Map[VertexId, Long]() 61 | Pregel(lpaGraph, initialMessage, maxIterations = maxSteps)( 62 | vprog = vertexProgram, 63 | sendMsg = sendMessage, 64 | mergeMsg = mergeMessage) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/PageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | import scala.language.postfixOps 22 | 23 | import org.apache.spark.Logging 24 | import org.apache.spark.graphx._ 25 | 26 | /** 27 | * 计算一张图中所有顶点的重要程度,进而对他们进行排名. 28 | * 这是GraphX提供的用Pregel的模型改进后产生的图算法,通常我们在进行使用PageRank的代码编写时并不涉及去改动这份源码, 29 | * 而是直接调用:graph.pageRank(0.0001) 30 | */ 31 | 32 | /** 33 | * PageRank algorithm implementation. There are two implementations of PageRank implemented. 34 | *PageRank的两种实现方式. 
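Typical usage of the label-propagation implementation above. The sketch assumes a local `SparkContext` named `sc`; the two-triangle edge list is made up so that two communities are obvious:

```scala
// Usage sketch for LabelPropagation; assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.graphx.lib.LabelPropagation

object LabelPropagationExample {
  def run(sc: SparkContext): Unit = {
    val edges = sc.parallelize(Seq(
      Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(1L, 3L, 1),          // triangle 1-2-3
      Edge(10L, 11L, 1), Edge(11L, 12L, 1), Edge(10L, 12L, 1)))   // triangle 10-11-12
    val graph = Graph.fromEdges(edges, 0)

    // Run a fixed number of supersteps; convergence is not guaranteed.
    val communities = LabelPropagation.run(graph, maxSteps = 5)
    communities.vertices.collect().foreach { case (vid, label) =>
      println(s"vertex $vid -> community label $label")
    }
  }
}
```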
35 | * The first implementation uses the standalone [[Graph]] interface and runs PageRank 36 | * for a fixed number of iterations: 37 | * 第一种(静态实现): 使用standalone [[Graph]]接口,在调用时提供一个参数number,用于指定迭代次数, 38 | * 即无论结果如何,该算法在迭代number次后停止计算,返回图结果。 39 | * {{{ 40 | * var PR = Array.fill(n)( 1.0 ) 41 | * val oldPR = Array.fill(n)( 1.0 ) 42 | * for( iter <- 0 until numIter ) { 43 | * swap(oldPR, PR) 44 | * for( i <- 0 until n ) { 45 | * PR[i] = alpha + (1 - alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum 46 | * } 47 | * } 48 | * }}} 49 | * 50 | * The second implementation uses the [[Pregel]] interface and runs PageRank until 51 | * convergence: 52 | * 第二种:(动态)在调用时提供一个参数tol,用于指定前后两次迭代的结果差值应小于tol, 53 | * 以达到最终收敛的效果时才停止计算,返回图结果。 54 | * {{{ 55 | * var PR = Array.fill(n)( 1.0 ) 56 | * val oldPR = Array.fill(n)( 0.0 ) 57 | * while( max(abs(PR - oldPr)) > tol ) { 58 | * swap(oldPR, PR) 59 | * for( i <- 0 until n if abs(PR[i] - oldPR[i]) > tol ) { 60 | * PR[i] = alpha + (1 - \alpha) * inNbrs[i].map(j => oldPR[j] / outDeg[j]).sum 61 | * } 62 | * } 63 | * }}} 64 | * 65 | * `alpha` is the random reset probability (typically 0.15), `inNbrs[i]` is the set of 66 | * neighbors which link to `i` and `outDeg[j]` is the out degree of vertex `j`. 67 | * 68 | * Note that this is not the "normalized" PageRank and as a consequence pages that have no 69 | * inlinks will have a PageRank of alpha. 70 | */ 71 | object PageRank extends Logging { 72 | 73 | 74 | /** 75 | * Run PageRank for a fixed number of iterations returning a graph 76 | * with vertex attributes containing the PageRank and edge 77 | * attributes the normalized edge weight. 78 | * 以固定的迭代次数运行PageRank算法,以图的形式返回,包括最终的顶点值(pagerank值) 79 | * 和标准化的边值(权重值),进而得到最终的排名结果. 80 | * @tparam VD the original vertex attribute (not used) 81 | * 顶点的属性类型(不需要用户指定,由你传入的图的属性决定) 82 | * @tparam ED the original edge attribute (not used) 83 | * 边的属性类型(不需要用户指定,由你传入的图的属性决定) 84 | * @param graph the graph on which to compute PageRank 85 | * 进行PageRank计算的图模型 86 | * @param numIter the number of iterations of PageRank to run 87 | * 迭代次数 88 | * @param resetProb the random reset probability (alpha) 89 | * 随机重置的概率,通常都是0.15 90 | * @return the graph containing with each vertex containing the PageRank and each edge 91 | * containing the normalized weight. 92 | * 以图的形式返回,包括最终的顶点值(pagerank值)和标准化的边值(权重值), 93 | * 进而得到最终的排名结果 94 | */ 95 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int, 96 | resetProb: Double = 0.15): Graph[Double, Double] = 97 | { 98 | runWithOptions(graph, numIter, resetProb) 99 | } 100 | 101 | /** 102 | * Run PageRank for a fixed number of iterations returning a graph 103 | * with vertex attributes containing the PageRank and edge 104 | * attributes the normalized edge weight. 105 | * 106 | * withOptions : 可以个性化定义srcId 107 | * 108 | * @tparam VD the original vertex attribute (not used) 109 | * @tparam ED the original edge attribute (not used) 110 | * 111 | * @param graph the graph on which to compute PageRank 112 | * 进行PageRank计算的图模型 113 | * @param numIter the number of iterations of PageRank to run 114 | * 迭代次数 115 | * @param resetProb the random reset probability (alpha) 116 | * 随机重置的概率,通常都是0.15 117 | * @param srcId the source vertex for a Personalized Page Rank (optional) 118 | * 个性化的顶点Id值 119 | * @return the graph containing with each vertex containing the PageRank and each edge 120 | * containing the normalized weight. 
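For intuition, here is the fixed-iteration recurrence from the pseudo-code above as a runnable single-machine sketch on a tiny made-up 3-vertex cycle; the distributed implementation below expresses the same update with `aggregateMessages` and `joinVertices`.

```scala
// Single-machine version of the fixed-iteration PageRank recurrence, for
// intuition only. inNbrs(i) lists the in-neighbours of i; outDeg(j) is j's
// out-degree; alpha is the reset probability.
object LocalPageRankSketch extends App {
  val alpha = 0.15
  val numIter = 20

  // Tiny 3-vertex cycle: 0 -> 1 -> 2 -> 0.
  val inNbrs = Array(Seq(2), Seq(0), Seq(1))
  val outDeg = Array(1, 1, 1)
  val n = inNbrs.length

  var pr = Array.fill(n)(1.0)
  for (_ <- 0 until numIter) {
    val oldPR = pr
    pr = Array.tabulate(n) { i =>
      alpha + (1 - alpha) * inNbrs(i).map(j => oldPR(j) / outDeg(j)).sum
    }
  }
  pr.zipWithIndex.foreach { case (rank, i) => println(s"vertex $i rank $rank") }
}
```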
121 | * 以图的形式返回,包括最终的顶点值(pagerank值)和标准化的边值(权重值) 122 | */ 123 | def runWithOptions[VD: ClassTag, ED: ClassTag]( 124 | graph: Graph[VD, ED], numIter: Int, resetProb: Double = 0.15, 125 | srcId: Option[VertexId] = None): Graph[Double, Double] = 126 | { 127 | //srcId是否被定义,即是否个性化,返回Boolean值 128 | val personalized = srcId isDefined 129 | //获取srcId的具体值,如果为空,则赋值为-1L 130 | val src: VertexId = srcId.getOrElse(-1L) 131 | 132 | // Initialize the PageRank graph with each edge attribute having 133 | // weight 1/outDegree and each vertex with attribute resetProb. 134 | // 下面这段代码用于初始化PageRank图模型rankGraph,具体内容是 135 | // 赋予每条边属性为值“1/该边的出度数”,赋予每个顶点属性为resetProb的值。 136 | // When running personalized pagerank, only the source vertex 137 | // has an attribute resetProb. All others are set to 0. 138 | // 当运行personlized PageRank时,仅仅是出发顶点使用resetProb作为属性, 139 | // 其他所有顶点的属性被设置为0. 140 | var rankGraph: Graph[Double, Double] = graph 141 | // Associate the degree with each vertex 142 | // 将出度数与每个顶点的属性值关联(给每个顶点添加出度这个属性) 143 | .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) } 144 | // Set the weight on the edges based on the degree 145 | //基于度数为边设置权重 146 | .mapTriplets( e => 1.0 / e.srcAttr, TripletFields.Src ) 147 | // Set the vertex attributes to the initial pagerank values 148 | //设置每个顶点的初始属性为initial pagerank values 149 | .mapVertices { (id, attr) => 150 | if (!(id != src && personalized)) resetProb else 0.0 151 | } 152 | 153 | def delta(u: VertexId, v: VertexId): Double = { if (u == v) 1.0 else 0.0 } 154 | 155 | var iteration = 0 156 | var prevRankGraph: Graph[Double, Double] = null 157 | while (iteration < numIter) { 158 | rankGraph.cache() 159 | 160 | // Compute the outgoing rank contributions of each vertex, perform local preaggregation, and 161 | // do the final aggregation at the receiving vertices. Requires a shuffle for aggregation. 162 | val rankUpdates = rankGraph.aggregateMessages[Double]( 163 | ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr), _ + _, TripletFields.Src) 164 | 165 | // Apply the final rank updates to get the new ranks, using join to preserve ranks of vertices 166 | // that didn't receive a message. Requires a shuffle for broadcasting updated ranks to the 167 | // edge partitions. 168 | prevRankGraph = rankGraph 169 | val rPrb = if (personalized) { 170 | (src: VertexId , id: VertexId) => resetProb * delta(src, id) 171 | } else { 172 | (src: VertexId, id: VertexId) => resetProb 173 | } 174 | 175 | rankGraph = rankGraph.joinVertices(rankUpdates) { 176 | (id, oldRank, msgSum) => rPrb(src, id) + (1.0 - resetProb) * msgSum 177 | }.cache() 178 | 179 | rankGraph.edges.foreachPartition(x => {}) // also materializes rankGraph.vertices 180 | logInfo(s"PageRank finished iteration $iteration.") 181 | prevRankGraph.vertices.unpersist(false) 182 | prevRankGraph.edges.unpersist(false) 183 | 184 | iteration += 1 185 | } 186 | 187 | rankGraph 188 | } 189 | 190 | /** 191 | * Run a dynamic version of PageRank returning a graph with vertex attributes containing the 192 | * PageRank and edge attributes containing the normalized edge weight. 193 | * 194 | * @tparam VD the original vertex attribute (not used) 195 | * @tparam ED the original edge attribute (not used) 196 | * 197 | * @param graph the graph on which to compute PageRank 198 | * @param tol the tolerance allowed at convergence (smaller => more accurate). 
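Typical usage of the fixed-iteration PageRank above, either via `PageRank.run` or the `graph.staticPageRank` shorthand. Assumes a local `SparkContext` named `sc` and a made-up edge list:

```scala
// Usage sketch for the fixed-iteration PageRank; assumes a SparkContext `sc`.
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.graphx.lib.PageRank

object StaticPageRankExample {
  def run(sc: SparkContext): Unit = {
    val edges = sc.parallelize(Seq(
      Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(3L, 1L, 1), Edge(4L, 1L, 1)))
    val graph = Graph.fromEdges(edges, 0)

    // 20 iterations with the usual reset probability alpha = 0.15.
    val ranks = PageRank.run(graph, numIter = 20, resetProb = 0.15)
    val sameViaOps = graph.staticPageRank(20)   // same computation via GraphOps

    ranks.vertices.collect().sortBy(-_._2).foreach { case (vid, rank) =>
      println(f"vertex $vid rank $rank%.4f")
    }
  }
}
```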
199 | * @param resetProb the random reset probability (alpha) 200 | * 201 | * @return the graph containing with each vertex containing the PageRank and each edge 202 | * containing the normalized weight. 203 | */ 204 | def runUntilConvergence[VD: ClassTag, ED: ClassTag]( 205 | graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15): Graph[Double, Double] = 206 | { 207 | runUntilConvergenceWithOptions(graph, tol, resetProb) 208 | } 209 | 210 | /** 211 | * Run a dynamic version of PageRank returning a graph with vertex attributes containing the 212 | * PageRank and edge attributes containing the normalized edge weight. 213 | * 214 | * @tparam VD the original vertex attribute (not used) 215 | * @tparam ED the original edge attribute (not used) 216 | * 217 | * @param graph the graph on which to compute PageRank 218 | * @param tol the tolerance allowed at convergence (smaller => more accurate). 219 | * @param resetProb the random reset probability (alpha) 220 | * @param srcId the source vertex for a Personalized Page Rank (optional) 221 | * 222 | * @return the graph containing with each vertex containing the PageRank and each edge 223 | * containing the normalized weight. 224 | */ 225 | def runUntilConvergenceWithOptions[VD: ClassTag, ED: ClassTag]( 226 | graph: Graph[VD, ED], tol: Double, resetProb: Double = 0.15, 227 | srcId: Option[VertexId] = None): Graph[Double, Double] = 228 | { 229 | val personalized = srcId.isDefined 230 | val src: VertexId = srcId.getOrElse(-1L) 231 | 232 | // Initialize the pagerankGraph with each edge attribute 233 | // having weight 1/outDegree and each vertex with attribute 1.0. 234 | val pagerankGraph: Graph[(Double, Double), Double] = graph 235 | // Associate the degree with each vertex 236 | .outerJoinVertices(graph.outDegrees) { 237 | (vid, vdata, deg) => deg.getOrElse(0) 238 | } 239 | // Set the weight on the edges based on the degree 240 | .mapTriplets( e => 1.0 / e.srcAttr ) 241 | // Set the vertex attributes to (initalPR, delta = 0) 242 | .mapVertices { (id, attr) => 243 | if (id == src) (resetProb, Double.NegativeInfinity) else (0.0, 0.0) 244 | } 245 | .cache() 246 | 247 | // Define the three functions needed to implement PageRank in the GraphX 248 | // 以下将定义三个所需函数来完成GraphX对PageRank的算法实现想 249 | // version of Pregel 250 | // 第一个函数用于返回一个考虑“随机事件”发生后的计算结果 251 | def vertexProgram(id: VertexId, attr: (Double, Double), msgSum: Double): (Double, Double) = { 252 | val (oldPR, lastDelta) = attr 253 | val newPR = oldPR + (1.0 - resetProb) * msgSum 254 | (newPR, newPR - oldPR) 255 | } 256 | // 257 | def personalizedVertexProgram(id: VertexId, attr: (Double, Double), 258 | msgSum: Double): (Double, Double) = { 259 | val (oldPR, lastDelta) = attr 260 | var teleport = oldPR 261 | val delta = if (src==id) 1.0 else 0.0 262 | teleport = oldPR*delta 263 | 264 | val newPR = teleport + (1.0 - resetProb) * msgSum 265 | val newDelta = if (lastDelta == Double.NegativeInfinity) newPR else newPR - oldPR 266 | (newPR, newDelta) 267 | } 268 | // 第二个函数用于得到一个迭代器,里面包含了两个信息:该边的目的ID、 269 | // 该边的源属性值和权重的乘积(该边传递的实际PR值) 270 | def sendMessage(edge: EdgeTriplet[(Double, Double), Double]) = { 271 | if (edge.srcAttr._2 > tol) { 272 | Iterator((edge.dstId, edge.srcAttr._2 * edge.attr)) 273 | } else { 274 | Iterator.empty 275 | } 276 | } 277 | // 第三个函数用于将顶点属性值和传递的值进行累加 278 | def messageCombiner(a: Double, b: Double): Double = a + b 279 | 280 | // The initial message received by all vertices in PageRank 281 | // 所有顶点接收到的初始信息 282 | val initialMessage = if (personalized) 0.0 else resetProb / 
(1.0 - resetProb) 283 | 284 | // Execute a dynamic version of Pregel. 285 | val vp = if (personalized) { 286 | (id: VertexId, attr: (Double, Double), msgSum: Double) => 287 | personalizedVertexProgram(id, attr, msgSum) 288 | } else { 289 | (id: VertexId, attr: (Double, Double), msgSum: Double) => 290 | vertexProgram(id, attr, msgSum) 291 | } 292 | 293 | Pregel(pagerankGraph, initialMessage, activeDirection = EdgeDirection.Out)( 294 | vp, sendMessage, messageCombiner) 295 | .mapVertices((vid, attr) => attr._1) 296 | } // end of deltaPageRank 297 | 298 | } 299 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.util.Random 21 | 22 | import com.github.fommil.netlib.BLAS.{getInstance => blas} 23 | 24 | import org.apache.spark.rdd._ 25 | import org.apache.spark.graphx._ 26 | 27 | /** Implementation of SVD++ algorithm. */ 28 | object SVDPlusPlus { 29 | 30 | /** Configuration parameters for SVDPlusPlus. */ 31 | class Conf( 32 | var rank: Int, 33 | var maxIters: Int, 34 | var minVal: Double, 35 | var maxVal: Double, 36 | var gamma1: Double, 37 | var gamma2: Double, 38 | var gamma6: Double, 39 | var gamma7: Double) 40 | extends Serializable 41 | 42 | /** 43 | * This method is now replaced by the updated version of `run()` and returns exactly 44 | * the same result. 45 | */ 46 | @deprecated("Call run()", "1.4.0") 47 | def runSVDPlusPlus(edges: RDD[Edge[Double]], conf: Conf) 48 | : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = 49 | { 50 | run(edges, conf) 51 | } 52 | 53 | /** 54 | * Implement SVD++ based on "Factorization Meets the Neighborhood: 55 | * a Multifaceted Collaborative Filtering Model", 56 | * available at [[http://public.research.att.com/~volinsky/netflix/kdd08koren.pdf]]. 57 | * 58 | * The prediction rule is rui = u + bu + bi + qi*(pu + |N(u)|^^-0.5^^*sum(y)), 59 | * see the details on page 6. 
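// Editor's illustration (not part of the original source): the prediction rule quoted above,
// rui = u + bu + bi + qi * (pu + |N(u)|^-0.5 * sum(y)), spelled out for a single user/item
// pair. All names below are hypothetical stand-ins for the trained quantities; `ySum` is the
// already-summed vector sum of y_j over the items N(u) rated by this user.
def svdppPredictSketch(
    u: Double, bu: Double, bi: Double,
    q: Array[Double], p: Array[Double], ySum: Array[Double], nU: Int): Double = {
  val norm = 1.0 / math.sqrt(nU.toDouble)                  // |N(u)|^-0.5
  val blended = p.indices.map(k => p(k) + norm * ySum(k))  // pu + |N(u)|^-0.5 * sum(y)
  u + bu + bi + q.indices.map(k => q(k) * blended(k)).sum  // plus the dot product with qi
}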
60 | * 61 | * @param edges edges for constructing the graph 62 | * 63 | * @param conf SVDPlusPlus parameters 64 | * 65 | * @return a graph with vertex attributes containing the trained model 66 | */ 67 | def run(edges: RDD[Edge[Double]], conf: Conf) 68 | : (Graph[(Array[Double], Array[Double], Double, Double), Double], Double) = 69 | { 70 | // Generate default vertex attribute 71 | def defaultF(rank: Int): (Array[Double], Array[Double], Double, Double) = { 72 | // TODO: use a fixed random seed 73 | val v1 = Array.fill(rank)(Random.nextDouble()) 74 | val v2 = Array.fill(rank)(Random.nextDouble()) 75 | (v1, v2, 0.0, 0.0) 76 | } 77 | 78 | // calculate global rating mean 79 | edges.cache() 80 | val (rs, rc) = edges.map(e => (e.attr, 1L)).reduce((a, b) => (a._1 + b._1, a._2 + b._2)) 81 | val u = rs / rc 82 | 83 | // construct graph 84 | var g = Graph.fromEdges(edges, defaultF(conf.rank)).cache() 85 | materialize(g) 86 | edges.unpersist() 87 | 88 | // Calculate initial bias and norm 89 | val t0 = g.aggregateMessages[(Long, Double)]( 90 | ctx => { ctx.sendToSrc((1L, ctx.attr)); ctx.sendToDst((1L, ctx.attr)) }, 91 | (g1, g2) => (g1._1 + g2._1, g1._2 + g2._2)) 92 | 93 | val gJoinT0 = g.outerJoinVertices(t0) { 94 | (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), 95 | msg: Option[(Long, Double)]) => 96 | (vd._1, vd._2, msg.get._2 / msg.get._1 - u, 1.0 / scala.math.sqrt(msg.get._1)) 97 | }.cache() 98 | materialize(gJoinT0) 99 | g.unpersist() 100 | g = gJoinT0 101 | 102 | def sendMsgTrainF(conf: Conf, u: Double) 103 | (ctx: EdgeContext[ 104 | (Array[Double], Array[Double], Double, Double), 105 | Double, 106 | (Array[Double], Array[Double], Double)]) { 107 | val (usr, itm) = (ctx.srcAttr, ctx.dstAttr) 108 | val (p, q) = (usr._1, itm._1) 109 | val rank = p.length 110 | var pred = u + usr._3 + itm._3 + blas.ddot(rank, q, 1, usr._2, 1) 111 | pred = math.max(pred, conf.minVal) 112 | pred = math.min(pred, conf.maxVal) 113 | val err = ctx.attr - pred 114 | // updateP = (err * q - conf.gamma7 * p) * conf.gamma2 115 | val updateP = q.clone() 116 | blas.dscal(rank, err * conf.gamma2, updateP, 1) 117 | blas.daxpy(rank, -conf.gamma7 * conf.gamma2, p, 1, updateP, 1) 118 | // updateQ = (err * usr._2 - conf.gamma7 * q) * conf.gamma2 119 | val updateQ = usr._2.clone() 120 | blas.dscal(rank, err * conf.gamma2, updateQ, 1) 121 | blas.daxpy(rank, -conf.gamma7 * conf.gamma2, q, 1, updateQ, 1) 122 | // updateY = (err * usr._4 * q - conf.gamma7 * itm._2) * conf.gamma2 123 | val updateY = q.clone() 124 | blas.dscal(rank, err * usr._4 * conf.gamma2, updateY, 1) 125 | blas.daxpy(rank, -conf.gamma7 * conf.gamma2, itm._2, 1, updateY, 1) 126 | ctx.sendToSrc((updateP, updateY, (err - conf.gamma6 * usr._3) * conf.gamma1)) 127 | ctx.sendToDst((updateQ, updateY, (err - conf.gamma6 * itm._3) * conf.gamma1)) 128 | } 129 | 130 | for (i <- 0 until conf.maxIters) { 131 | // Phase 1, calculate pu + |N(u)|^(-0.5)*sum(y) for user nodes 132 | g.cache() 133 | val t1 = g.aggregateMessages[Array[Double]]( 134 | ctx => ctx.sendToSrc(ctx.dstAttr._2), 135 | (g1, g2) => { 136 | val out = g1.clone() 137 | blas.daxpy(out.length, 1.0, g2, 1, out, 1) 138 | out 139 | }) 140 | val gJoinT1 = g.outerJoinVertices(t1) { 141 | (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), 142 | msg: Option[Array[Double]]) => 143 | if (msg.isDefined) { 144 | val out = vd._1.clone() 145 | blas.daxpy(out.length, vd._4, msg.get, 1, out, 1) 146 | (vd._1, out, vd._3, vd._4) 147 | } else { 148 | vd 149 | } 150 | }.cache() 151 | 
materialize(gJoinT1) 152 | g.unpersist() 153 | g = gJoinT1 154 | 155 | // Phase 2, update p for user nodes and q, y for item nodes 156 | g.cache() 157 | val t2 = g.aggregateMessages( 158 | sendMsgTrainF(conf, u), 159 | (g1: (Array[Double], Array[Double], Double), g2: (Array[Double], Array[Double], Double)) => 160 | { 161 | val out1 = g1._1.clone() 162 | blas.daxpy(out1.length, 1.0, g2._1, 1, out1, 1) 163 | val out2 = g2._2.clone() 164 | blas.daxpy(out2.length, 1.0, g2._2, 1, out2, 1) 165 | (out1, out2, g1._3 + g2._3) 166 | }) 167 | val gJoinT2 = g.outerJoinVertices(t2) { 168 | (vid: VertexId, 169 | vd: (Array[Double], Array[Double], Double, Double), 170 | msg: Option[(Array[Double], Array[Double], Double)]) => { 171 | val out1 = vd._1.clone() 172 | blas.daxpy(out1.length, 1.0, msg.get._1, 1, out1, 1) 173 | val out2 = vd._2.clone() 174 | blas.daxpy(out2.length, 1.0, msg.get._2, 1, out2, 1) 175 | (out1, out2, vd._3 + msg.get._3, vd._4) 176 | } 177 | }.cache() 178 | materialize(gJoinT2) 179 | g.unpersist() 180 | g = gJoinT2 181 | } 182 | 183 | // calculate error on training set 184 | def sendMsgTestF(conf: Conf, u: Double) 185 | (ctx: EdgeContext[(Array[Double], Array[Double], Double, Double), Double, Double]) { 186 | val (usr, itm) = (ctx.srcAttr, ctx.dstAttr) 187 | val (p, q) = (usr._1, itm._1) 188 | var pred = u + usr._3 + itm._3 + blas.ddot(q.length, q, 1, usr._2, 1) 189 | pred = math.max(pred, conf.minVal) 190 | pred = math.min(pred, conf.maxVal) 191 | val err = (ctx.attr - pred) * (ctx.attr - pred) 192 | ctx.sendToDst(err) 193 | } 194 | 195 | g.cache() 196 | val t3 = g.aggregateMessages[Double](sendMsgTestF(conf, u), _ + _) 197 | val gJoinT3 = g.outerJoinVertices(t3) { 198 | (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), msg: Option[Double]) => 199 | if (msg.isDefined) (vd._1, vd._2, vd._3, msg.get) else vd 200 | }.cache() 201 | materialize(gJoinT3) 202 | g.unpersist() 203 | g = gJoinT3 204 | 205 | // Convert DoubleMatrix to Array[Double]: 206 | val newVertices = g.vertices.mapValues(v => (v._1.toArray, v._2.toArray, v._3, v._4)) 207 | (Graph(newVertices, g.edges), u) 208 | } 209 | 210 | /** 211 | * Forces materialization of a Graph by count()ing its RDDs. 212 | */ 213 | private def materialize(g: Graph[_, _]): Unit = { 214 | g.vertices.count() 215 | g.edges.count() 216 | } 217 | 218 | } 219 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/ShortestPaths.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
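// Editor's note (illustration only; it refers to SVDPlusPlus.run above, not to this file's
// license header): every training phase follows the same cache -> materialize -> unpersist
// pattern, so the previous graph is released only after its replacement has been computed.
// A generic sketch of that pattern, with a hypothetical helper name:
import org.apache.spark.graphx.Graph

def swapCached[VD, ED](prev: Graph[VD, ED], updated: Graph[VD, ED]): Graph[VD, ED] = {
  updated.cache()
  updated.vertices.count()   // force materialization of the new graph,
  updated.edges.count()      // exactly what materialize() above does
  prev.unpersist()           // now it is safe to drop the previous iteration's graph
  updated
}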
16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import org.apache.spark.graphx._ 21 | import scala.reflect.ClassTag 22 | 23 | /** 24 | * Computes shortest paths to the given set of landmark vertices, returning a graph where each 25 | * vertex attribute is a map containing the shortest-path distance to each reachable landmark. 26 | */ 27 | object ShortestPaths { 28 | /** Stores a map from the vertex id of a landmark to the distance to that landmark. */ 29 | type SPMap = Map[VertexId, Int] 30 | 31 | private def makeMap(x: (VertexId, Int)*) = Map(x: _*) 32 | 33 | private def incrementMap(spmap: SPMap): SPMap = spmap.map { case (v, d) => v -> (d + 1) } 34 | 35 | private def addMaps(spmap1: SPMap, spmap2: SPMap): SPMap = 36 | (spmap1.keySet ++ spmap2.keySet).map { 37 | k => k -> math.min(spmap1.getOrElse(k, Int.MaxValue), spmap2.getOrElse(k, Int.MaxValue)) 38 | }.toMap 39 | 40 | /** 41 | * Computes shortest paths to the given set of landmark vertices. 42 | * 43 | * @tparam ED the edge attribute type (not used in the computation) 44 | * 45 | * @param graph the graph for which to compute the shortest paths 46 | * @param landmarks the list of landmark vertex ids. Shortest paths will be computed to each 47 | * landmark. 48 | * 49 | * @return a graph where each vertex attribute is a map containing the shortest-path distance to 50 | * each reachable landmark vertex. 51 | */ 52 | def run[VD, ED: ClassTag](graph: Graph[VD, ED], landmarks: Seq[VertexId]): Graph[SPMap, ED] = { 53 | val spGraph = graph.mapVertices { (vid, attr) => 54 | if (landmarks.contains(vid)) makeMap(vid -> 0) else makeMap() 55 | } 56 | 57 | val initialMessage = makeMap() 58 | 59 | def vertexProgram(id: VertexId, attr: SPMap, msg: SPMap): SPMap = { 60 | addMaps(attr, msg) 61 | } 62 | 63 | def sendMessage(edge: EdgeTriplet[SPMap, _]): Iterator[(VertexId, SPMap)] = { 64 | val newAttr = incrementMap(edge.dstAttr) 65 | if (edge.srcAttr != addMaps(newAttr, edge.srcAttr)) Iterator((edge.srcId, newAttr)) 66 | else Iterator.empty 67 | } 68 | 69 | Pregel(spGraph, initialMessage)(vertexProgram, sendMessage, addMaps) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | 24 | /** Strongly connected components algorithm implementation. 
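// Editor's illustration (it refers to ShortestPaths.scala above, not to this file): calling
// run() on a small directed chain. The graph shape and the landmark are made up; distances
// are counted along edge direction towards the landmark, because messages flow dst -> src.
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.lib.ShortestPaths

def shortestPathsExample(sc: SparkContext): Unit = {
  // chain 1 -> 2 -> 3, with vertex 3 as the only landmark
  val graph = Graph.fromEdgeTuples(sc.parallelize(Seq((1L, 2L), (2L, 3L))), 0)
  val result = ShortestPaths.run(graph, landmarks = Seq(3L))
  // expected: (1, Map(3 -> 2)), (2, Map(3 -> 1)), (3, Map(3 -> 0))
  result.vertices.collect().foreach(println)
}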
*/ 25 | object StronglyConnectedComponents { 26 | 27 | /** 28 | * Compute the strongly connected component (SCC) of each vertex and return a graph with the 29 | * vertex value containing the lowest vertex id in the SCC containing that vertex. 30 | * 31 | * @tparam VD the vertex attribute type (discarded in the computation) 32 | * @tparam ED the edge attribute type (preserved in the computation) 33 | * 34 | * @param graph the graph for which to compute the SCC 35 | * 36 | * @return a graph with vertex attributes containing the smallest vertex id in each SCC 37 | */ 38 | def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int): Graph[VertexId, ED] = { 39 | 40 | // the graph we update with final SCC ids, and the graph we return at the end 41 | var sccGraph = graph.mapVertices { case (vid, _) => vid } 42 | // graph we are going to work with in our iterations 43 | var sccWorkGraph = graph.mapVertices { case (vid, _) => (vid, false) }.cache() 44 | 45 | var numVertices = sccWorkGraph.numVertices 46 | var iter = 0 47 | while (sccWorkGraph.numVertices > 0 && iter < numIter) { 48 | iter += 1 49 | do { 50 | numVertices = sccWorkGraph.numVertices 51 | sccWorkGraph = sccWorkGraph.outerJoinVertices(sccWorkGraph.outDegrees) { 52 | (vid, data, degreeOpt) => if (degreeOpt.isDefined) data else (vid, true) 53 | }.outerJoinVertices(sccWorkGraph.inDegrees) { 54 | (vid, data, degreeOpt) => if (degreeOpt.isDefined) data else (vid, true) 55 | }.cache() 56 | 57 | // get all vertices to be removed 58 | val finalVertices = sccWorkGraph.vertices 59 | .filter { case (vid, (scc, isFinal)) => isFinal} 60 | .mapValues { (vid, data) => data._1} 61 | 62 | // write values to sccGraph 63 | sccGraph = sccGraph.outerJoinVertices(finalVertices) { 64 | (vid, scc, opt) => opt.getOrElse(scc) 65 | } 66 | // only keep vertices that are not final 67 | sccWorkGraph = sccWorkGraph.subgraph(vpred = (vid, data) => !data._2).cache() 68 | } while (sccWorkGraph.numVertices < numVertices) 69 | 70 | sccWorkGraph = sccWorkGraph.mapVertices{ case (vid, (color, isFinal)) => (vid, isFinal) } 71 | 72 | // collect min of all my neighbor's scc values, update if it's smaller than mine 73 | // then notify any neighbors with scc values larger than mine 74 | sccWorkGraph = Pregel[(VertexId, Boolean), ED, VertexId]( 75 | sccWorkGraph, Long.MaxValue, activeDirection = EdgeDirection.Out)( 76 | (vid, myScc, neighborScc) => (math.min(myScc._1, neighborScc), myScc._2), 77 | e => { 78 | if (e.srcAttr._1 < e.dstAttr._1) { 79 | Iterator((e.dstId, e.srcAttr._1)) 80 | } else { 81 | Iterator() 82 | } 83 | }, 84 | (vid1, vid2) => math.min(vid1, vid2)) 85 | 86 | // start at root of SCCs. Traverse values in reverse, notify all my neighbors 87 | // do not propagate if colors do not match! 
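// (Editor's note) In other words: the Pregel pass above has already labelled every vertex
// with the smallest vertex id that can reach it, its "color". The pass below runs along
// incoming edges (EdgeDirection.In): the vertex whose id equals its own color is the root of
// its SCC and becomes final, and finality then spreads backwards to predecessors that carry
// the same color. The vertices finalized here form complete SCCs; they are written into
// sccGraph and dropped from the working graph at the start of the next outer iteration.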
88 | sccWorkGraph = Pregel[(VertexId, Boolean), ED, Boolean]( 89 | sccWorkGraph, false, activeDirection = EdgeDirection.In)( 90 | // vertex is final if it is the root of a color 91 | // or it has the same color as a neighbor that is final 92 | (vid, myScc, existsSameColorFinalNeighbor) => { 93 | val isColorRoot = vid == myScc._1 94 | (myScc._1, myScc._2 || isColorRoot || existsSameColorFinalNeighbor) 95 | }, 96 | // activate neighbor if they are not final, you are, and you have the same color 97 | e => { 98 | val sameColor = e.dstAttr._1 == e.srcAttr._1 99 | val onlyDstIsFinal = e.dstAttr._2 && !e.srcAttr._2 100 | if (sameColor && onlyDstIsFinal) { 101 | Iterator((e.srcId, e.dstAttr._2)) 102 | } else { 103 | Iterator() 104 | } 105 | }, 106 | (final1, final2) => final1 || final2) 107 | } 108 | sccGraph 109 | } 110 | 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.lib 19 | 20 | import scala.reflect.ClassTag 21 | 22 | import org.apache.spark.graphx._ 23 | 24 | /** 25 | * Compute the number of triangles passing through each vertex. 26 | * 27 | * The algorithm is relatively straightforward and can be computed in three steps: 28 | * 29 | *
182 | * 183 | * dst -> 184 | * (x,y) *************** _ 185 | * | | | | 186 | * | a | b | | 187 | * src | | | | 188 | * | *************** | T 189 | * \|/ | | | | 190 | * | c | d | | 191 | * | | | | 192 | * *************** - 193 | *194 | * 195 | * where this represents the subquadrant of the adj matrix currently being 196 | * subdivided. (x,y) represent the upper left hand corner of the subquadrant, 197 | * and T represents the side length (guaranteed to be a power of 2). 198 | * 199 | * After choosing the next level subquadrant, we get the resulting sets 200 | * of parameters: 201 | * {{{ 202 | * quad = a, x'=x, y'=y, T'=T/2 203 | * quad = b, x'=x+T/2, y'=y, T'=T/2 204 | * quad = c, x'=x, y'=y+T/2, T'=T/2 205 | * quad = d, x'=x+T/2, y'=y+T/2, T'=T/2 206 | * }}} 207 | */ 208 | @tailrec 209 | private def chooseCell(x: Int, y: Int, t: Int): (Int, Int) = { 210 | if (t <= 1) { 211 | (x, y) 212 | } else { 213 | val newT = math.round(t.toFloat/2.0).toInt 214 | pickQuadrant(RMATa, RMATb, RMATc, RMATd) match { 215 | case 0 => chooseCell(x, y, newT) 216 | case 1 => chooseCell(x + newT, y, newT) 217 | case 2 => chooseCell(x, y + newT, newT) 218 | case 3 => chooseCell(x + newT, y + newT, newT) 219 | } 220 | } 221 | } 222 | 223 | // TODO(crankshaw) turn result into an enum (or case class for pattern matching} 224 | private def pickQuadrant(a: Double, b: Double, c: Double, d: Double): Int = { 225 | if (a + b + c + d != 1.0) { 226 | throw new IllegalArgumentException("R-MAT probability parameters sum to " + (a + b + c + d) 227 | + ", should sum to 1.0") 228 | } 229 | val rand = new Random() 230 | val result = rand.nextDouble() 231 | result match { 232 | case x if x < a => 0 // 0 corresponds to quadrant a 233 | case x if (x >= a && x < a + b) => 1 // 1 corresponds to b 234 | case x if (x >= a + b && x < a + b + c) => 2 // 2 corresponds to c 235 | case _ => 3 // 3 corresponds to d 236 | } 237 | } 238 | 239 | /** 240 | * Create `rows` by `cols` grid graph with each vertex connected to its 241 | * row+1 and col+1 neighbors. Vertex ids are assigned in row major 242 | * order. 243 | * 244 | * @param sc the spark context in which to construct the graph 245 | * @param rows the number of rows 246 | * @param cols the number of columns 247 | * 248 | * @return A graph containing vertices with the row and column ids 249 | * as their attributes and edge values as 1.0. 250 | */ 251 | def gridGraph(sc: SparkContext, rows: Int, cols: Int): Graph[(Int, Int), Double] = { 252 | // Convert row column address into vertex ids (row major order) 253 | def sub2ind(r: Int, c: Int): VertexId = r * cols + c 254 | 255 | val vertices: RDD[(VertexId, (Int, Int))] = sc.parallelize(0 until rows).flatMap { r => 256 | (0 until cols).map( c => (sub2ind(r, c), (r, c)) ) 257 | } 258 | val edges: RDD[Edge[Double]] = 259 | vertices.flatMap{ case (vid, (r, c)) => 260 | (if (r + 1 < rows) { Seq( (sub2ind(r, c), sub2ind(r + 1, c))) } else { Seq.empty }) ++ 261 | (if (c + 1 < cols) { Seq( (sub2ind(r, c), sub2ind(r, c + 1))) } else { Seq.empty }) 262 | }.map{ case (src, dst) => Edge(src, dst, 1.0) } 263 | Graph(vertices, edges) 264 | } // end of gridGraph 265 | 266 | /** 267 | * Create a star graph with vertex 0 being the center. 268 | * 269 | * @param sc the spark context in which to construct the graph 270 | * @param nverts the number of vertices in the star 271 | * 272 | * @return A star graph containing `nverts` vertices with vertex 0 273 | * being the center vertex. 
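// Editor's illustration (not part of GraphGenerators): the row-major vertex numbering used by
// gridGraph's sub2ind above. For rows = 2 and cols = 3 the grid cells map to vertex ids as
//   (0,0)=0  (0,1)=1  (0,2)=2
//   (1,0)=3  (1,1)=4  (1,2)=5
// and the same formula can be checked standalone:
def sub2indSketch(r: Int, c: Int, cols: Int): Long = r.toLong * cols + c
// sub2indSketch(1, 2, cols = 3) == 5L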
274 | */ 275 | def starGraph(sc: SparkContext, nverts: Int): Graph[Int, Int] = { 276 | val edges: RDD[(VertexId, VertexId)] = sc.parallelize(1 until nverts).map(vid => (vid, 0)) 277 | Graph.fromEdgeTuples(edges, 1) 278 | } // end of starGraph 279 | 280 | } // end of Graph Generators 281 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/util/collection/GraphXPrimitiveKeyOpenHashMap.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx.util.collection 19 | 20 | import org.apache.spark.util.collection.OpenHashSet 21 | 22 | import scala.reflect._ 23 | 24 | /** 25 | * A fast hash map implementation for primitive, non-null keys. This hash map supports 26 | * insertions and updates, but not deletions. This map is about an order of magnitude 27 | * faster than java.util.HashMap, while using much less space overhead. 28 | * 29 | * Under the hood, it uses our OpenHashSet implementation. 30 | */ 31 | private[graphx] 32 | class GraphXPrimitiveKeyOpenHashMap[@specialized(Long, Int) K: ClassTag, 33 | @specialized(Long, Int, Double) V: ClassTag]( 34 | val keySet: OpenHashSet[K], var _values: Array[V]) 35 | extends Iterable[(K, V)] 36 | with Serializable { 37 | 38 | /** 39 | * Allocate an OpenHashMap with a fixed initial capacity 40 | */ 41 | def this(initialCapacity: Int) = 42 | this(new OpenHashSet[K](initialCapacity), new Array[V](initialCapacity)) 43 | 44 | /** 45 | * Allocate an OpenHashMap with a default initial capacity, providing a true 46 | * no-argument constructor. 47 | */ 48 | def this() = this(64) 49 | 50 | /** 51 | * Allocate an OpenHashMap with a fixed initial capacity 52 | */ 53 | def this(keySet: OpenHashSet[K]) = this(keySet, new Array[V](keySet.capacity)) 54 | 55 | require(classTag[K] == classTag[Long] || classTag[K] == classTag[Int]) 56 | 57 | private var _oldValues: Array[V] = null 58 | 59 | override def size: Int = keySet.size 60 | 61 | /** Get the value for a given key */ 62 | def apply(k: K): V = { 63 | val pos = keySet.getPos(k) 64 | _values(pos) 65 | } 66 | 67 | /** Get the value for a given key, or returns elseValue if it doesn't exist. 
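// (Editor's note) apply() above assumes the key is already present: for a missing key,
// keySet.getPos(k) returns a negative position and the backing-array access fails. Callers
// that are unsure whether a key exists use getOrElse below; setMerge and changeValue, further
// below, differ in how they treat an existing key. A hypothetical usage sketch:
//   val m = new GraphXPrimitiveKeyOpenHashMap[Long, Int]()
//   m.update(1L, 10)
//   m.getOrElse(1L, 0)          // 10
//   m.getOrElse(2L, 0)          // 0, the key is absent
//   m.setMerge(1L, 5, _ + _)    // key present: stores mergeF(10, 5) = 15
//   m.changeValue(2L, 0, _ + 1) // key absent: stores the default value 0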
*/ 68 | def getOrElse(k: K, elseValue: V): V = { 69 | val pos = keySet.getPos(k) 70 | if (pos >= 0) _values(pos) else elseValue 71 | } 72 | 73 | /** Set the value for a key */ 74 | def update(k: K, v: V) { 75 | val pos = keySet.addWithoutResize(k) & OpenHashSet.POSITION_MASK 76 | _values(pos) = v 77 | keySet.rehashIfNeeded(k, grow, move) 78 | _oldValues = null 79 | } 80 | 81 | 82 | /** Set the value for a key */ 83 | def setMerge(k: K, v: V, mergeF: (V, V) => V) { 84 | val pos = keySet.addWithoutResize(k) 85 | val ind = pos & OpenHashSet.POSITION_MASK 86 | if ((pos & OpenHashSet.NONEXISTENCE_MASK) != 0) { // if first add 87 | _values(ind) = v 88 | } else { 89 | _values(ind) = mergeF(_values(ind), v) 90 | } 91 | keySet.rehashIfNeeded(k, grow, move) 92 | _oldValues = null 93 | } 94 | 95 | 96 | /** 97 | * If the key doesn't exist yet in the hash map, set its value to defaultValue; otherwise, 98 | * set its value to mergeValue(oldValue). 99 | * 100 | * @return the newly updated value. 101 | */ 102 | def changeValue(k: K, defaultValue: => V, mergeValue: (V) => V): V = { 103 | val pos = keySet.addWithoutResize(k) 104 | if ((pos & OpenHashSet.NONEXISTENCE_MASK) != 0) { 105 | val newValue = defaultValue 106 | _values(pos & OpenHashSet.POSITION_MASK) = newValue 107 | keySet.rehashIfNeeded(k, grow, move) 108 | newValue 109 | } else { 110 | _values(pos) = mergeValue(_values(pos)) 111 | _values(pos) 112 | } 113 | } 114 | 115 | override def iterator: Iterator[(K, V)] = new Iterator[(K, V)] { 116 | var pos = 0 117 | var nextPair: (K, V) = computeNextPair() 118 | 119 | /** Get the next value we should return from next(), or null if we're finished iterating */ 120 | def computeNextPair(): (K, V) = { 121 | pos = keySet.nextPos(pos) 122 | if (pos >= 0) { 123 | val ret = (keySet.getValue(pos), _values(pos)) 124 | pos += 1 125 | ret 126 | } else { 127 | null 128 | } 129 | } 130 | 131 | def hasNext: Boolean = nextPair != null 132 | 133 | def next(): (K, V) = { 134 | val pair = nextPair 135 | nextPair = computeNextPair() 136 | pair 137 | } 138 | } 139 | 140 | // The following member variables are declared as protected instead of private for the 141 | // specialization to work (specialized class extends the unspecialized one and needs access 142 | // to the "private" variables). 143 | // They also should have been val's. We use var's because there is a Scala compiler bug that 144 | // would throw illegal access error at runtime if they are declared as val's. 145 | protected var grow = (newCapacity: Int) => { 146 | _oldValues = _values 147 | _values = new Array[V](newCapacity) 148 | } 149 | 150 | protected var move = (oldPos: Int, newPos: Int) => { 151 | _values(newPos) = _oldValues(oldPos) 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/util/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | /** 19 | * Collections of utilities used by graphx. 20 | */ 21 | package org.apache.spark.graphx.util; -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/graphx/util/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.graphx 19 | 20 | /** 21 | * Collections of utilities used by graphx. 22 | */ 23 | package object util 24 | --------------------------------------------------------------------------------