├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── dependencies.R ├── gf_bfs.R ├── gf_connected_components.R ├── gf_examples.R ├── gf_interface.R ├── gf_lpa.R ├── gf_pagerank.R ├── gf_scc.R ├── gf_shortest_paths.R ├── gf_triangle_count.R └── imports.R ├── README.Rmd ├── README.md ├── codecov.yml ├── cran-comments.md ├── graphframes.Rproj ├── man-roxygen ├── roxlate-gf-dots.R ├── roxlate-gf-sc.R └── roxlate-gf-x.R ├── man ├── gf_bfs.Rd ├── gf_cache.Rd ├── gf_chain.Rd ├── gf_connected_components.Rd ├── gf_degrees.Rd ├── gf_edge_columns.Rd ├── gf_edges.Rd ├── gf_find.Rd ├── gf_friends.Rd ├── gf_graphframe.Rd ├── gf_grid_ising_model.Rd ├── gf_in_degrees.Rd ├── gf_lpa.Rd ├── gf_out_degrees.Rd ├── gf_pagerank.Rd ├── gf_persist.Rd ├── gf_register.Rd ├── gf_scc.Rd ├── gf_shortest_paths.Rd ├── gf_star.Rd ├── gf_triangle_count.Rd ├── gf_triplets.Rd ├── gf_two_blobs.Rd ├── gf_unpersist.Rd ├── gf_vertex_columns.Rd ├── gf_vertices.Rd └── spark_graphframe.Rd ├── tests ├── testthat.R └── testthat │ ├── helper-initialize.R │ ├── output │ ├── friends.txt │ ├── gf_bfs.txt │ ├── gf_chain.txt │ ├── gf_degrees.txt │ ├── gf_find.txt │ ├── gf_out_degrees.txt │ ├── gf_pagerank.txt │ ├── gf_shortest_paths.txt │ ├── gf_star.txt │ ├── gf_triangle_count.txt │ ├── triplets.txt │ └── two_blobs.txt │ ├── test-gf-algos.R │ ├── test-gf-examples.R │ └── test-gf-interface.R └── tools └── readme ├── unnamed-chunk-4-1.png └── unnamed-chunk-5-1.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^README\.Rmd$ 5 | ^README_files$ 6 | ^README_images$ 7 | ^configure$ 8 | ^configure\.win$ 9 | ^configure\.R$ 10 | ^examples/ 11 | ^logs 12 | ^demo\.R$ 13 | ^docs/ 14 | ^internal/ 15 | ^man-roxygen/ 16 | ^res/ 17 | ^derby\.log$ 18 | ^metastore_db$ 19 | ^spark-warehouse$ 20 | ^log4j\..*$ 21 | ^run-tests$ 22 | ^.classpath$ 23 | ^.project$ 24 | 
^codecov\.yml$ 25 | ^cran-comments\.md$ 26 | ^tests/testthat/logs 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | demo.R 6 | derby.log 7 | log4j.spark.* 8 | internal 9 | .DS_Store 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | 7 | after_success: 8 | - Rscript -e 'covr::codecov()' 9 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: graphframes 2 | Type: Package 3 | Title: Interface for 'GraphFrames' 4 | Version: 0.1.2 5 | Authors@R: person("Kevin", "Kuo", email = "kevin.kuo@rstudio.com", 6 | role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7803-7901")) 7 | Maintainer: Kevin Kuo 8 | Description: A 'sparklyr' extension that provides an R 9 | interface for 'GraphFrames' . 'GraphFrames' is a package 10 | for 'Apache Spark' that provides a DataFrame-based API for working with graphs. Functionality 11 | includes motif finding and common graph algorithms, such as PageRank and Breadth-first 12 | search. 
13 | URL: https://github.com/rstudio/graphframes 14 | BugReports: https://github.com/rstudio/graphframes/issues 15 | License: Apache License 2.0 | file LICENSE 16 | Encoding: UTF-8 17 | LazyData: true 18 | RoxygenNote: 6.1.0 19 | Imports: 20 | sparklyr, 21 | tibble, 22 | forge 23 | Suggests: 24 | testthat, 25 | covr, 26 | dplyr 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(gf_register,graphframe) 4 | S3method(gf_register,spark_jobj) 5 | S3method(print,graphframe) 6 | export(gf_bfs) 7 | export(gf_cache) 8 | export(gf_chain) 9 | export(gf_connected_components) 10 | export(gf_degrees) 11 | export(gf_edge_columns) 12 | export(gf_edges) 13 | export(gf_find) 14 | export(gf_friends) 15 | export(gf_graphframe) 16 | export(gf_grid_ising_model) 17 | export(gf_in_degrees) 18 | export(gf_lpa) 19 | export(gf_out_degrees) 20 | export(gf_pagerank) 21 | export(gf_persist) 22 | export(gf_register) 23 | export(gf_scc) 24 | export(gf_shortest_paths) 25 | export(gf_star) 26 | export(gf_triangle_count) 27 | export(gf_triplets) 28 | export(gf_two_blobs) 29 | export(gf_unpersist) 30 | export(gf_vertex_columns) 31 | export(gf_vertices) 32 | export(spark_graphframe) 33 | import(forge) 34 | import(sparklyr) 35 | 
-------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # graphframes 0.1.2 2 | 3 | * Updated dependency to graphframes 0.6.0, with support for Spark 2.3. 4 | -------------------------------------------------------------------------------- /R/dependencies.R: -------------------------------------------------------------------------------- 1 | spark_dependencies <- function(spark_version, scala_version, ...) { 2 | graphframes_version <- if (spark_version >= "2.4.0") { # NOTE(review): if spark_version arrives as a plain character string, >= compares text rather than version numbers -- confirm what the caller passes 3 | "0.7.0" 4 | } else if (spark_version >= "2.2.0") { 5 | "0.6.0" 6 | } else { 7 | "0.5.0" 8 | } 9 | 10 | spark_dependency( 11 | jars = NULL, 12 | packages = c( 13 | sprintf( 14 | "graphframes:graphframes:%s-spark%s-s_%s", 15 | graphframes_version, 16 | spark_version, 17 | scala_version 18 | ) 19 | ) 20 | ) 21 | } 22 | 23 | .onLoad <- function(libname, pkgname) { 24 | sparklyr::register_extension(pkgname) # runs at package load: registers this package as a sparklyr extension 25 | } 26 | -------------------------------------------------------------------------------- /R/gf_bfs.R: -------------------------------------------------------------------------------- 1 | #' Breadth-first search (BFS) 2 | #' 3 | #' @template roxlate-gf-x 4 | #' 5 | #' @param from_expr Spark SQL expression specifying valid starting vertices for the BFS. 6 | #' @param to_expr Spark SQL expression specifying valid target vertices for the BFS. 7 | #' @param max_path_length Limit on the length of paths. Defaults to 10. 8 | #' @param edge_filter Spark SQL expression specifying edges which may be used in the search. If \code{NULL} (the default), no edge filter is applied. 9 | #' @template roxlate-gf-dots 10 | #' 11 | #' @examples 12 | #' \dontrun{ 13 | #' g <- gf_friends(sc) 14 | #' gf_bfs(g, from_expr = "name = 'Esther'", to_expr = "age < 32") 15 | #' } 16 | #' @export 17 | gf_bfs <- function(x, 18 | from_expr, 19 | to_expr, 20 | max_path_length = 10, 21 | edge_filter = NULL, ...)
{ 22 | 23 | from_expr <- cast_string(from_expr) 24 | to_expr <- cast_string(to_expr) 25 | max_path_length <- cast_scalar_integer(max_path_length) 26 | edge_filter <- cast_nullable_string(edge_filter) 27 | 28 | gf <- spark_graphframe(x) 29 | 30 | algo <- gf %>% 31 | invoke("bfs") 32 | 33 | algo <- algo %>% 34 | invoke("fromExpr", from_expr) %>% 35 | invoke("toExpr", to_expr) %>% 36 | invoke("maxPathLength", max_path_length) 37 | 38 | if (!is.null(edge_filter)) 39 | algo <- invoke(algo, "edgeFilter", edge_filter) 40 | 41 | algo %>% 42 | invoke("run") %>% 43 | sdf_register() 44 | } 45 | -------------------------------------------------------------------------------- /R/gf_connected_components.R: -------------------------------------------------------------------------------- 1 | #' Connected components 2 | #' 3 | #' Computes the connected component membership of each vertex and returns a DataFrame 4 | #' of vertex information with each vertex assigned a component ID. 5 | #' 6 | #' @template roxlate-gf-x 7 | #' @param broadcast_threshold Broadcast threshold in propagating component assignments. Defaults to 1000000. 8 | #' @param algorithm One of 'graphframes' or 'graphx'. Defaults to 'graphframes'. 9 | #' @param checkpoint_interval Checkpoint interval in terms of number of iterations. Defaults to 2. 10 | #' @template roxlate-gf-dots 11 | #' @examples 12 | #' \dontrun{ 13 | #' # checkpoint directory is required for gf_connected_components() 14 | #' spark_set_checkpoint_dir(sc, tempdir()) 15 | #' g <- gf_friends(sc) 16 | #' gf_connected_components(g) 17 | #' } 18 | #' @export 19 | gf_connected_components <- function(x, 20 | broadcast_threshold = 1000000L, 21 | algorithm = c("graphframes", "graphx"), 22 | checkpoint_interval = 2L, ...)
{ 23 | algorithm <- match.arg(algorithm) 24 | broadcast_threshold <- cast_scalar_integer(broadcast_threshold) 25 | checkpoint_interval <- cast_scalar_integer(checkpoint_interval) 26 | 27 | gf <- spark_graphframe(x) 28 | 29 | algo <- gf %>% 30 | invoke("connectedComponents") %>% 31 | invoke("setBroadcastThreshold", broadcast_threshold) %>% 32 | invoke("setAlgorithm", algorithm) %>% 33 | invoke("setCheckpointInterval", checkpoint_interval) 34 | 35 | algo %>% 36 | invoke("run") %>% 37 | sdf_register() 38 | } 39 | -------------------------------------------------------------------------------- /R/gf_examples.R: -------------------------------------------------------------------------------- 1 | #' Graph of friends in a social network. 2 | #' 3 | #' @examples 4 | #' \dontrun{ 5 | #' library(sparklyr) 6 | #' sc <- spark_connect(master = "local") 7 | #' gf_friends(sc) 8 | #' } 9 | #' @template roxlate-gf-sc 10 | #' @export 11 | gf_friends <- function(sc) { 12 | examples_graphs(sc) %>% 13 | invoke("friends") %>% 14 | gf_register() 15 | } 16 | 17 | #' Chain graph 18 | #' 19 | #' Returns a chain graph of the given size with Long ID type. 20 | #' The vertex IDs are 0, 1, ..., n-1, and the edges are (0, 1), (1, 2), ..., (n-2, n-1). 21 | #' @template roxlate-gf-sc 22 | #' @param n Size of the graph to return. 23 | #' @examples 24 | #' \dontrun{ 25 | #' gf_chain(sc, 5) 26 | #' } 27 | #' @export 28 | gf_chain <- function(sc, n) { 29 | n <- cast_scalar_integer(n) 30 | examples_graphs(sc) %>% 31 | invoke("chain", n) %>% 32 | gf_register() 33 | } 34 | 35 | #' Generate a grid Ising model with random parameters 36 | #' 37 | #' @details This method generates a grid Ising model with random parameters. Ising models 38 | #' are probabilistic graphical models over binary variables xi. Each binary 39 | #' variable xi corresponds to one vertex, and it may take values -1 or +1.
40 | #' The probability distribution P(X) (over all xi) is parameterized by 41 | #' vertex factors ai and edge factors bij: 42 | #' 43 | #' \deqn{P(X) = (1/Z) * exp[ \sum_i a_i x_i + \sum_{ij} b_{ij} x_i x_j ]} 44 | #' 45 | #' @template roxlate-gf-sc 46 | #' @param n Length of one side of the grid. The grid will be of size n x n. 47 | #' @param v_std Standard deviation of normal distribution used to generate vertex factors "a". Default of 1.0. 48 | #' @param e_std Standard deviation of normal distribution used to generate edge factors "b". Default of 1.0. 49 | #' 50 | #' @return GraphFrame. Vertices have columns "id" and "a". Edges have columns "src", 51 | #' "dst", and "b". Edges are directed, but they should be treated as undirected in 52 | #' any algorithms run on this model. Vertex IDs are of the form "i,j". E.g., vertex 53 | #' "1,3" is in the second row and fourth column of the grid. 54 | #' 55 | #' @examples 56 | #' \dontrun{ 57 | #' gf_grid_ising_model(sc, 5) 58 | #' } 59 | #' @export 60 | gf_grid_ising_model <- function(sc, n, v_std = 1, e_std = 1) { 61 | sql_context <- invoke_new(sc, "org.apache.spark.sql.SQLContext", spark_context(sc)) # built here because it is passed as the first argument to gridIsingModel below 62 | n <- cast_scalar_integer(n) 63 | v_std <- cast_scalar_double(v_std) 64 | e_std <- cast_scalar_double(e_std) 65 | 66 | examples_graphs(sc) %>% 67 | invoke("gridIsingModel", sql_context, n, v_std, e_std) %>% 68 | gf_register() 69 | } 70 | 71 | #' Generate a star graph 72 | #' 73 | #' Returns a star graph with Long ID type, consisting of a central element 74 | #' indexed 0 (the root) and the n other leaf vertices 1, 2, ..., n. 75 | #' @template roxlate-gf-sc 76 | #' @param n The number of leaves.
77 | #' 78 | #' @examples 79 | #' \dontrun{ 80 | #' gf_star(sc, 5) 81 | #' } 82 | #' @export 83 | gf_star <- function(sc, n) { 84 | n <- cast_scalar_integer(n) 85 | 86 | examples_graphs(sc) %>% 87 | invoke("star", n) %>% 88 | gf_register() 89 | } 90 | 91 | #' Generate two blobs 92 | #' 93 | #' Two densely connected blobs (vertices 0->n-1 and n->2n-1) 94 | #' connected by a single edge (0->n). 95 | #' @template roxlate-gf-sc 96 | #' @param blob_size The size of each blob. 97 | #' 98 | #' @examples 99 | #' \dontrun{ 100 | #' gf_two_blobs(sc, 3) 101 | #' } 102 | #' @export 103 | gf_two_blobs <- function(sc, blob_size) { 104 | blob_size <- cast_scalar_integer(blob_size) 105 | 106 | examples_graphs(sc) %>% 107 | invoke("twoBlobs", blob_size) %>% 108 | gf_register() 109 | } 110 | 111 | examples_graphs <- function(sc) { 112 | invoke_new(sc, "org.graphframes.examples.Graphs") # new instance of the examples class; the gf_* example helpers invoke a graph-building method on it 113 | } 114 | -------------------------------------------------------------------------------- /R/gf_interface.R: -------------------------------------------------------------------------------- 1 | #' Retrieve a GraphFrame 2 | #' 3 | #' @rdname spark_graphframe 4 | #' 5 | #' @param ... additional arguments, not used 6 | #' @export 7 | spark_graphframe <- function(x, ...) { 8 | UseMethod("spark_graphframe") 9 | } 10 | 11 | #' @rdname spark_graphframe 12 | #' @template roxlate-gf-x 13 | #' @export 14 | spark_graphframe <- function(x, ...) { # NOTE(review): reassigns the name bound to the UseMethod() generic above, so this plain function is the one left in effect -- confirm that is intended 15 | x$.jobj 16 | } 17 | 18 | new_graphframe <- function(jobj) { 19 | structure( 20 | list( 21 | vertices = jobj %>% 22 | invoke("vertices") %>% 23 | sdf_register(), 24 | edges = jobj %>% 25 | invoke("edges") %>% 26 | sdf_register(), 27 | .jobj = jobj # keeps the underlying object; spark_graphframe() reads it back as x$.jobj 28 | ), 29 | class = "graphframe" 30 | ) 31 | } 32 | 33 | #' Create a new GraphFrame 34 | #' 35 | #' @param vertices A \code{tbl_spark} representing vertices. If \code{NULL} (the default), the graph is built from \code{edges} alone via \code{GraphFrame.fromEdges}. 36 | #' @param edges A \code{tbl_spark} representing edges.
37 | #' 38 | #' @examples 39 | #' \dontrun{ 40 | #' library(sparklyr) 41 | #' sc <- spark_connect(master = "local", version = "2.3.0") 42 | #' v_tbl <- sdf_copy_to( 43 | #' sc, data.frame(id = 1:3, name = LETTERS[1:3]) 44 | #' ) 45 | #' e_tbl <- sdf_copy_to( 46 | #' sc, data.frame(src = c(1, 2, 2), dst = c(2, 1, 3), 47 | #' action = c("love", "hate", "follow")) 48 | #' ) 49 | #' gf_graphframe(v_tbl, e_tbl) 50 | #' gf_graphframe(edges = e_tbl) 51 | #' } 52 | #' @export 53 | gf_graphframe <- function(vertices = NULL, edges) { 54 | sc <- edges %>% 55 | spark_dataframe() %>% 56 | spark_connection() # the connection is taken from edges, the only argument without a default 57 | 58 | jobj <- if (is.null(vertices)) { # no vertices supplied: call the static GraphFrame.fromEdges on the edges DataFrame 59 | invoke_static(sc, 60 | "org.graphframes.GraphFrame", 61 | "fromEdges", 62 | spark_dataframe(edges)) 63 | } else { 64 | invoke_new(sc, 65 | "org.graphframes.GraphFrame", 66 | spark_dataframe(vertices), 67 | spark_dataframe(edges)) 68 | } 69 | 70 | new_graphframe(jobj) 71 | } 72 | 73 | #' @export 74 | print.graphframe <- function(x, ...) { 75 | extract_and_print <- function(x) { 76 | output <- utils::capture.output(x) 77 | extracted_output <- paste0(" ", 78 | output[3:length(output)]) # skips the first two captured lines; NOTE(review): 3:length(output) counts downward when fewer than three lines are captured 79 | cat(extracted_output, sep = "\n") 80 | } 81 | cat("GraphFrame\n") 82 | cat("Vertices:", sep = "\n") 83 | extract_and_print(tibble::glimpse(x$vertices)) 84 | cat("Edges:", sep = "\n") 85 | extract_and_print(tibble::glimpse(x$edges)) 86 | invisible(x) 87 | } 88 | 89 | #' Extract vertices DataFrame 90 | #' @template roxlate-gf-x 91 | #' @export 92 | gf_vertices <- function(x) { 93 | x$vertices 94 | } 95 | 96 | #' Extract edges DataFrame 97 | #' @template roxlate-gf-x 98 | #' @export 99 | gf_edges <- function(x) { 100 | x$edges 101 | } 102 | 103 | #' Triplets of graph 104 | #' 105 | #' @template roxlate-gf-x 106 | #' @export 107 | gf_triplets <- function(x) { 108 | x %>% 109 | spark_graphframe() %>% 110 | invoke("triplets") %>% 111 | sdf_register() 112 | } 113 | 114 | #' Register a GraphFrame object 115 | #' 116 | #' @template roxlate-gf-x 117 | #'
@export 118 | gf_register <- function(x) { 119 | UseMethod("gf_register") 120 | } 121 | 122 | #' @export 123 | gf_register.spark_jobj <- function(x) { 124 | new_graphframe(x) 125 | } 126 | 127 | #' @export 128 | gf_register.graphframe <- function(x) { 129 | x 130 | } 131 | 132 | #' Vertices column names 133 | #' 134 | #' @template roxlate-gf-x 135 | #' @export 136 | gf_vertex_columns <- function(x) { 137 | gf <- spark_graphframe(x) 138 | columns <- invoke(gf, "vertexColumns") 139 | 140 | unlist(columns) 141 | } 142 | 143 | #' Edges column names 144 | #' 145 | #' @template roxlate-gf-x 146 | #' @export 147 | gf_edge_columns <- function(x) { 148 | gf <- spark_graphframe(x) 149 | columns <- invoke(gf, "edgeColumns") 150 | 151 | unlist(columns) 152 | } 153 | 154 | #' Out-degrees of vertices 155 | #' 156 | #' @template roxlate-gf-x 157 | #' @export 158 | gf_out_degrees <- function(x) { 159 | gf <- spark_graphframe(x) 160 | out_degrees <- invoke(gf, "outDegrees") 161 | 162 | sdf_register(out_degrees) 163 | } 164 | 165 | #' In-degrees of vertices 166 | #' 167 | #' @template roxlate-gf-x 168 | #' @export 169 | gf_in_degrees <- function(x) { 170 | gf <- spark_graphframe(x) 171 | in_degrees <- invoke(gf, "inDegrees") 172 | 173 | sdf_register(in_degrees) 174 | } 175 | 176 | #' Degrees of vertices 177 | #' 178 | #' @template roxlate-gf-x 179 | #' @export 180 | gf_degrees <- function(x) { 181 | gf <- spark_graphframe(x) 182 | degrees <- invoke(gf, "degrees") 183 | 184 | sdf_register(degrees) 185 | } 186 | 187 | #' Motif finding: Searching the graph for structural patterns 188 | #' 189 | #' Motif finding uses a simple Domain-Specific Language (DSL) for 190 | #' expressing structural queries. For example, 191 | #' gf_find(g, "(a)-[e]->(b); (b)-[e2]->(a)") will search for 192 | #' pairs of vertices a,b connected by edges in both directions. 193 | #' It will return a DataFrame of all such structures in the graph, 194 | #' with columns for each of the named elements (vertices or edges) 195 | #' in the motif. 
In this case, the returned columns will be in 196 | #' order of the pattern: "a, e, b, e2." 197 | #' 198 | #' @template roxlate-gf-x 199 | #' 200 | #' @param pattern Motif pattern to search for, written in the DSL described above. 201 | #' 202 | #' @examples 203 | #' \dontrun{ 204 | #' gf_friends(sc) %>% 205 | #' gf_find("(a)-[e]->(b); (b)-[e2]->(a)") 206 | #' } 207 | #' @export 208 | gf_find <- function(x, pattern) { 209 | motif <- cast_string(pattern) 210 | 211 | gf <- spark_graphframe(x) 212 | matches <- invoke(gf, "find", motif) 213 | 214 | sdf_register(matches) 215 | } 216 | 217 | #' Cache the GraphFrame 218 | #' 219 | #' @template roxlate-gf-x 220 | #' 221 | #' @export 222 | gf_cache <- function(x) { 223 | gf <- spark_graphframe(x) 224 | cached <- invoke(gf, "cache") 225 | 226 | gf_register(cached) 227 | } 228 | 229 | #' Persist the GraphFrame 230 | #' 231 | #' @template roxlate-gf-x 232 | #' 233 | #' @param storage_level The storage level to be used. Please view the 234 | #' \href{https://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}{Spark Documentation} 235 | #' for information on what storage levels are accepted. 
236 | #' 237 | #' @export 238 | gf_persist <- function(x, storage_level = "MEMORY_AND_DISK") { 239 | level_name <- cast_string(storage_level) 240 | gf <- spark_graphframe(x) 241 | level <- invoke_static( 242 | spark_connection(gf), 243 | "org.apache.spark.storage.StorageLevel", 244 | level_name 245 | ) 246 | 247 | persisted <- invoke(gf, "persist", level) 248 | 249 | gf_register(persisted) 250 | } 251 | 252 | #' Unpersist the GraphFrame 253 | #' 254 | #' @template roxlate-gf-x 255 | #' 256 | #' @param blocking Whether to block until all blocks are deleted. 257 | #' 258 | #' @export 259 | gf_unpersist <- function(x, blocking = FALSE) { 260 | block <- cast_scalar_logical(blocking) 261 | 262 | gf <- spark_graphframe(x) 263 | unpersisted <- invoke(gf, "unpersist", block) 264 | 265 | gf_register(unpersisted) 266 | } 267 | 268 | -------------------------------------------------------------------------------- /R/gf_lpa.R: -------------------------------------------------------------------------------- 1 | #' Label propagation algorithm (LPA) 2 | #' 3 | #' Run static Label Propagation for detecting communities in networks. Each node in the 4 | #' network is initially assigned to its own community. At every iteration, nodes send 5 | #' their community affiliation to all neighbors and update their state to the mode 6 | #' community affiliation of incoming messages. LPA is a standard community detection 7 | #' algorithm for graphs. It is very inexpensive 8 | #' computationally, although (1) convergence is not guaranteed and (2) one can 9 | #' end up with trivial solutions (all nodes are identified into a single community). 10 | #' 11 | #' @template roxlate-gf-x 12 | #' @param max_iter Maximum number of iterations. 13 | #' @template roxlate-gf-dots 14 | #' 15 | #' @examples 16 | #' \dontrun{ 17 | #' g <- gf_friends(sc) 18 | #' gf_lpa(g, max_iter = 5) 19 | #' } 20 | #' @export 21 | gf_lpa <- function(x, max_iter, ...) 
{ 22 | iterations <- cast_scalar_integer(max_iter) 23 | 24 | gf <- spark_graphframe(x) 25 | 26 | builder <- invoke(gf, "labelPropagation") 27 | algo <- invoke(builder, "maxIter", iterations) 28 | 29 | # Run the configured algorithm and register its output via sdf_register(). 30 | communities <- invoke(algo, "run") 31 | 32 | sdf_register(communities) 33 | } 34 | -------------------------------------------------------------------------------- /R/gf_pagerank.R: -------------------------------------------------------------------------------- 1 | #' PageRank 2 | #' 3 | #' @template roxlate-gf-x 4 | #' @param tol Tolerance. Exactly one of \code{tol} and \code{max_iter} must be specified. 5 | #' @param reset_probability Reset probability. 6 | #' @param max_iter Maximum number of iterations. Cannot be combined with \code{tol}. 7 | #' @param source_id (Optional) Source vertex for a personalized pagerank. 8 | #' @template roxlate-gf-dots 9 | #' 10 | #' @examples 11 | #' \dontrun{ 12 | #' g <- gf_friends(sc) 13 | #' gf_pagerank(g, reset_probability = 0.15, tol = 0.01) 14 | #' } 15 | #' @export 16 | gf_pagerank <- function(x, tol = NULL, reset_probability = 0.15, max_iter = NULL, 17 | source_id = NULL, ...) 
{ 18 | tol <- cast_nullable_scalar_double(tol) 19 | reset_probability <- cast_scalar_double(reset_probability) 20 | max_iter <- cast_nullable_scalar_integer(max_iter) 21 | source_id <- cast_nullable_string(source_id) 22 | 23 | gf <- spark_graphframe(x) 24 | 25 | has_tol <- !is.null(tol) 26 | has_max_iter <- !is.null(max_iter) 27 | if (!has_tol && !has_max_iter) stop("One of 'tol' and 'max_iter' must be specified") 28 | if (has_tol && has_max_iter) stop("You cannot specify both 'tol' and 'max_iter'") 29 | 30 | algo <- invoke(gf, "pageRank") 31 | algo <- invoke(algo, "resetProbability", reset_probability) 32 | 33 | # Apply only the optional settings the caller supplied. 34 | if (has_tol) { 35 | algo <- invoke(algo, "tol", tol) 36 | } 37 | if (has_max_iter) { 38 | algo <- invoke(algo, "maxIter", max_iter) 39 | } 40 | if (!is.null(source_id)) { 41 | algo <- invoke(algo, "sourceId", source_id) 42 | } 43 | 44 | ranked <- invoke(algo, "run") 45 | gf_register(ranked) 46 | } 47 | -------------------------------------------------------------------------------- /R/gf_scc.R: -------------------------------------------------------------------------------- 1 | #' Strongly connected components 2 | #' 3 | #' Compute the strongly connected component (SCC) of each vertex and return a 4 | #' DataFrame with each vertex assigned to the SCC containing that vertex. 5 | #' 6 | #' @template roxlate-gf-x 7 | #' @param max_iter Maximum number of iterations. 8 | #' @template roxlate-gf-dots 9 | #' 10 | #' @examples 11 | #' \dontrun{ 12 | #' g <- gf_friends(sc) 13 | #' gf_scc(g, max_iter = 10) 14 | #' } 15 | #' @export 16 | gf_scc <- function(x, max_iter, ...) 
{ 17 | iterations <- cast_scalar_integer(max_iter) 18 | 19 | gf <- spark_graphframe(x) 20 | 21 | builder <- invoke(gf, "stronglyConnectedComponents") 22 | algo <- invoke(builder, "maxIter", iterations) 23 | 24 | # Run the configured algorithm and register its output via sdf_register(). 25 | components <- invoke(algo, "run") 26 | 27 | sdf_register(components) 28 | } 29 | -------------------------------------------------------------------------------- /R/gf_shortest_paths.R: -------------------------------------------------------------------------------- 1 | #' Shortest paths 2 | #' 3 | #' Computes shortest paths from every vertex to the given set of landmark vertices. 4 | #' Note that this takes edge direction into account. 5 | #' 6 | #' @template roxlate-gf-x 7 | #' @param landmarks IDs of landmark vertices. 8 | #' @template roxlate-gf-dots 9 | #' 10 | #' @examples 11 | #' \dontrun{ 12 | #' g <- gf_friends(sc) 13 | #' gf_shortest_paths(g, landmarks = c("a", "d")) 14 | #' } 15 | #' @export 16 | gf_shortest_paths <- function(x, landmarks, ...) { 17 | landmark_ids <- cast_string_list(landmarks) 18 | 19 | gf <- spark_graphframe(x) 20 | 21 | builder <- invoke(gf, "shortestPaths") 22 | algo <- invoke(builder, "landmarks", landmark_ids) 23 | 24 | # Run the configured algorithm and register its output via sdf_register(). 25 | paths <- invoke(algo, "run") 26 | 27 | sdf_register(paths) 28 | } 29 | -------------------------------------------------------------------------------- /R/gf_triangle_count.R: -------------------------------------------------------------------------------- 1 | #' Computes the number of triangles passing through each vertex. 2 | #' 3 | #' This algorithm ignores edge direction; i.e., all edges are treated 4 | #' as undirected. In a multigraph, duplicate edges will be counted only once. 5 | #' 6 | #' @template roxlate-gf-x 7 | #' @template roxlate-gf-dots 8 | #' 9 | #' @examples 10 | #' \dontrun{ 11 | #' g <- gf_friends(sc) 12 | #' gf_triangle_count(g) 13 | #' } 14 | #' @export 15 | gf_triangle_count <- function(x, ...) 
{ 16 | gf <- spark_graphframe(x) 17 | 18 | algo <- invoke(gf, "triangleCount") 19 | 20 | # Run the algorithm and register its output via sdf_register(). 21 | counts <- invoke(algo, "run") 22 | 23 | sdf_register(counts) 24 | } 25 | -------------------------------------------------------------------------------- /R/imports.R: -------------------------------------------------------------------------------- 1 | #' @import sparklyr 2 | NULL 3 | 4 | #' @import forge 5 | NULL 6 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R interface for GraphFrames" 3 | output: 4 | github_document: 5 | fig_width: 9 6 | fig_height: 5 7 | --- 8 | 9 | ```{r setup, include=FALSE} 10 | knitr::opts_chunk$set(eval = TRUE) 11 | knitr::opts_chunk$set(warning = FALSE) 12 | knitr::opts_chunk$set(fig.path = "tools/readme/", dev = "png") 13 | ``` 14 | 15 | [![Build Status](https://travis-ci.org/rstudio/graphframes.svg?branch=master)](https://travis-ci.org/rstudio/graphframes) [![Coverage status](https://codecov.io/gh/rstudio/graphframes/branch/master/graph/badge.svg)](https://codecov.io/github/rstudio/graphframes?branch=master) [![CRAN status](https://www.r-pkg.org/badges/version/graphframes)](https://cran.r-project.org/package=graphframes) 16 | 17 | - Support for [GraphFrames](https://graphframes.github.io/) which aims to provide the functionality of [GraphX](http://spark.apache.org/graphx/). 18 | - Perform graph algorithms like: [PageRank](https://graphframes.github.io/api/scala/index.html#org.graphframes.lib.PageRank), [ShortestPaths](https://graphframes.github.io/api/scala/index.html#org.graphframes.lib.ShortestPaths) and many [others](https://graphframes.github.io/api/scala/#package). 19 | - Designed to work with [sparklyr](https://spark.rstudio.com) and the [sparklyr extensions](http://spark.rstudio.com/extensions.html). 
20 | 21 | ## Installation 22 | 23 | For those already using `sparklyr`, simply run: 24 | 25 | ```{r eval=FALSE} 26 | install.packages("graphframes") 27 | # or, for the development version, 28 | # devtools::install_github("rstudio/graphframes") 29 | ``` 30 | 31 | Otherwise, first install `sparklyr` from CRAN using: 32 | 33 | ```{r eval=FALSE} 34 | install.packages("sparklyr") 35 | ``` 36 | 37 | The examples make use of the `highschool` dataset from the `ggplot` package. 38 | 39 | ## Getting Started 40 | 41 | We will calculate [PageRank](https://en.wikipedia.org/wiki/PageRank) over the built-in "friends" dataset as follows. 42 | 43 | ```{r message=FALSE} 44 | library(graphframes) 45 | library(sparklyr) 46 | library(dplyr) 47 | 48 | # connect to spark using sparklyr 49 | sc <- spark_connect(master = "local", version = "2.3.0") 50 | 51 | # obtain the example graph 52 | g <- gf_friends(sc) 53 | 54 | # compute PageRank 55 | results <- gf_pagerank(g, tol = 0.01, reset_probability = 0.15) 56 | results 57 | ``` 58 | 59 | We can then visualize the results by collecting the results to R: 60 | 61 | ```{r, message = FALSE} 62 | library(tidygraph) 63 | library(ggraph) 64 | 65 | vertices <- results %>% 66 | gf_vertices() %>% 67 | collect() 68 | 69 | edges <- results %>% 70 | gf_edges() %>% 71 | collect() 72 | 73 | edges %>% 74 | as_tbl_graph() %>% 75 | activate(nodes) %>% 76 | left_join(vertices, by = c(name = "id")) %>% 77 | ggraph(layout = "nicely") + 78 | geom_node_label(aes(label = name.y, color = pagerank)) + 79 | geom_edge_link( 80 | aes( 81 | alpha = weight, 82 | start_cap = label_rect(node1.name.y), 83 | end_cap = label_rect(node2.name.y) 84 | ), 85 | arrow = arrow(length = unit(4, "mm")) 86 | ) + 87 | theme_graph(fg_text_colour = 'white') 88 | ``` 89 | 90 | ## Further Reading 91 | 92 | Apart from calculating `PageRank` using `gf_pagerank`, many other functions are available, including: 93 | 94 | - `gf_bfs()`: Breadth-first search (BFS). 
95 | - `gf_connected_components()`: Connected components. 96 | - `gf_shortest_paths()`: Shortest paths algorithm. 97 | - `gf_scc()`: Strongly connected components. 98 | - `gf_triangle_count()`: Computes the number of triangles passing through each vertex and others. 99 | - `gf_degrees()`: Degrees of vertices 100 | 101 | For instance, one can calculate the degrees of vertices using `gf_degrees` as follows: 102 | 103 | ```{r message=FALSE} 104 | gf_friends(sc) %>% gf_degrees() 105 | ``` 106 | 107 | Finally, we disconnect from Spark: 108 | 109 | ```{r} 110 | spark_disconnect(sc) 111 | ``` 112 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | R interface for GraphFrames 2 | ================ 3 | 4 | [![Build 5 | Status](https://travis-ci.org/rstudio/graphframes.svg?branch=master)](https://travis-ci.org/rstudio/graphframes) 6 | [![Coverage 7 | status](https://codecov.io/gh/rstudio/graphframes/branch/master/graph/badge.svg)](https://codecov.io/github/rstudio/graphframes?branch=master) 8 | [![CRAN 9 | status](https://www.r-pkg.org/badges/version/graphframes)](https://cran.r-project.org/package=graphframes) 10 | 11 | - Support for [GraphFrames](https://graphframes.github.io/) which aims 12 | to provide the functionality of 13 | [GraphX](http://spark.apache.org/graphx/). 14 | - Perform graph algorithms like: 15 | [PageRank](https://graphframes.github.io/api/scala/index.html#org.graphframes.lib.PageRank), 16 | [ShortestPaths](https://graphframes.github.io/api/scala/index.html#org.graphframes.lib.ShortestPaths) 17 | and many [others](https://graphframes.github.io/api/scala/#package). 18 | - Designed to work with [sparklyr](https://spark.rstudio.com) and the 19 | [sparklyr extensions](http://spark.rstudio.com/extensions.html). 
20 | 21 | ## Installation 22 | 23 | For those already using `sparklyr` simply run: 24 | 25 | ``` r 26 | install.packages("graphframes") 27 | # or, for the development version, 28 | # devtools::install_github("rstudio/graphframes") 29 | ``` 30 | 31 | Otherwise, install first `sparklyr` from CRAN using: 32 | 33 | ``` r 34 | install.packages("sparklyr") 35 | ``` 36 | 37 | The examples make use of the `highschool` dataset from the `ggplot` 38 | package. 39 | 40 | ## Getting Started 41 | 42 | We will calculate [PageRank](https://en.wikipedia.org/wiki/PageRank) 43 | over the built-in “friends” dataset as follows. 44 | 45 | ``` r 46 | library(graphframes) 47 | library(sparklyr) 48 | library(dplyr) 49 | 50 | # connect to spark using sparklyr 51 | sc <- spark_connect(master = "local", version = "2.3.0") 52 | 53 | # obtain the example graph 54 | g <- gf_friends(sc) 55 | 56 | # compute PageRank 57 | results <- gf_pagerank(g, tol = 0.01, reset_probability = 0.15) 58 | results 59 | ``` 60 | 61 | ## GraphFrame 62 | ## Vertices: 63 | ## $ id "f", "b", "g", "a", "d", "c", "e" 64 | ## $ name "Fanny", "Bob", "Gabby", "Alice", "David", "Charlie",... 65 | ## $ age 36, 36, 60, 34, 29, 30, 32 66 | ## $ pagerank 0.3283607, 2.6555078, 0.1799821, 0.4491063, 0.3283607... 67 | ## Edges: 68 | ## $ src "b", "c", "d", "e", "a", "a", "e", "f" 69 | ## $ dst "c", "b", "a", "f", "e", "b", "d", "c" 70 | ## $ relationship "follow", "follow", "friend", "follow", "friend",... 
71 | ## $ weight 1.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 1.0 72 | 73 | We can then visualize the results by collecting the results to R: 74 | 75 | ``` r 76 | library(tidygraph) 77 | library(ggraph) 78 | 79 | vertices <- results %>% 80 | gf_vertices() %>% 81 | collect() 82 | 83 | edges <- results %>% 84 | gf_edges() %>% 85 | collect() 86 | 87 | edges %>% 88 | as_tbl_graph() %>% 89 | activate(nodes) %>% 90 | left_join(vertices, by = c(name = "id")) %>% 91 | ggraph(layout = "nicely") + 92 | geom_node_label(aes(label = name.y, color = pagerank)) + 93 | geom_edge_link( 94 | aes( 95 | alpha = weight, 96 | start_cap = label_rect(node1.name.y), 97 | end_cap = label_rect(node2.name.y) 98 | ), 99 | arrow = arrow(length = unit(4, "mm")) 100 | ) + 101 | theme_graph(fg_text_colour = 'white') 102 | ``` 103 | 104 | ![](tools/readme/unnamed-chunk-4-1.png) 105 | 106 | ## Further Reading 107 | 108 | Apart from calculating `PageRank` using `gf_pagerank`, many other 109 | functions are available, including: 110 | 111 | - `gf_bfs()`: Breadth-first search (BFS). 112 | - `gf_connected_components()`: Connected components. 113 | - `gf_shortest_paths()`: Shortest paths algorithm. 114 | - `gf_scc()`: Strongly connected components. 115 | - `gf_triangle_count()`: Computes the number of triangles passing 116 | through each vertex and others. 117 | - `gf_degrees()`: Degrees of vertices 118 | 119 | For instance, one can calculate the degrees of vertices using 120 | `gf_degrees` as follows: 121 | 122 | ``` r 123 | gf_friends(sc) %>% gf_degrees() 124 | ``` 125 | 126 | ## # Source: spark [?? 
x 2] 127 | ## id degree 128 | ## * 129 | ## 1 f 2 130 | ## 2 b 3 131 | ## 3 a 3 132 | ## 4 c 3 133 | ## 5 e 3 134 | ## 6 d 2 135 | 136 | Finally, we disconnect from Spark: 137 | 138 | ``` r 139 | spark_disconnect(sc) 140 | ``` 141 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 1% 13 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local OS X install, R 3.5.1 3 | * ubuntu 14.04 (on travis-ci), R 3.5.1 4 | * win-builder (devel) 5 | 6 | ## R CMD check results 7 | There were no ERRORs or WARNINGs. 8 | -------------------------------------------------------------------------------- /graphframes.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-gf-dots.R: -------------------------------------------------------------------------------- 1 | #' @param ... Optional arguments, currently not used. 
2 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-gf-sc.R: -------------------------------------------------------------------------------- 1 | #' @param sc A Spark connection. 2 | -------------------------------------------------------------------------------- /man-roxygen/roxlate-gf-x.R: -------------------------------------------------------------------------------- 1 | #' @param x An object coercible to a GraphFrame (typically, a 2 | #' \code{gf_graphframe}). 3 | -------------------------------------------------------------------------------- /man/gf_bfs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_bfs.R 3 | \name{gf_bfs} 4 | \alias{gf_bfs} 5 | \title{Breadth-first search (BFS)} 6 | \usage{ 7 | gf_bfs(x, from_expr, to_expr, max_path_length = 10, edge_filter = NULL, 8 | ...) 9 | } 10 | \arguments{ 11 | \item{x}{An object coercible to a GraphFrame (typically, a 12 | \code{gf_graphframe}).} 13 | 14 | \item{from_expr}{Spark SQL expression specifying valid starting vertices for the BFS.} 15 | 16 | \item{to_expr}{Spark SQL expression specifying valid target vertices for the BFS.} 17 | 18 | \item{max_path_length}{Limit on the length of paths.} 19 | 20 | \item{edge_filter}{Spark SQL expression specifying edges which may be used in the search.} 21 | 22 | \item{...}{Optional arguments, currently not used.} 23 | } 24 | \description{ 25 | Breadth-first search (BFS) 26 | } 27 | \examples{ 28 | \dontrun{ 29 | g <- gf_friends(sc) 30 | gf_bfs(g, from_expr = "name = 'Esther'", to_expr = "age < 32") 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/gf_cache.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | 
\name{gf_cache} 4 | \alias{gf_cache} 5 | \title{Cache the GraphFrame} 6 | \usage{ 7 | gf_cache(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Cache the GraphFrame 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_chain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_examples.R 3 | \name{gf_chain} 4 | \alias{gf_chain} 5 | \title{Chain graph} 6 | \usage{ 7 | gf_chain(sc, n) 8 | } 9 | \arguments{ 10 | \item{sc}{A Spark connection.} 11 | 12 | \item{n}{Size of the graph to return.} 13 | } 14 | \description{ 15 | Returns a chain graph of the given size with Long ID type. 16 | The vertex IDs are 0, 1, ..., n-1, and the edges are (0, 1), (1, 2), ...., (n-2, n-1). 17 | } 18 | \examples{ 19 | \dontrun{ 20 | gf_chain(sc, 5) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /man/gf_connected_components.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_connected_components.R 3 | \name{gf_connected_components} 4 | \alias{gf_connected_components} 5 | \title{Connected components} 6 | \usage{ 7 | gf_connected_components(x, broadcast_threshold = 1000000L, 8 | algorithm = c("graphframes", "graphx"), checkpoint_interval = 2L, 9 | ...) 
10 | } 11 | \arguments{ 12 | \item{x}{An object coercable to a GraphFrame (typically, a 13 | \code{gf_graphframe}).} 14 | 15 | \item{broadcast_threshold}{Broadcast threshold in propagating component assignments.} 16 | 17 | \item{algorithm}{One of 'graphframes' or 'graphx'.} 18 | 19 | \item{checkpoint_interval}{Checkpoint interval in terms of number of iterations.} 20 | 21 | \item{...}{Optional arguments, currently not used.} 22 | } 23 | \description{ 24 | Computes the connected component membership of each vertex and returns a DataFrame 25 | of vertex information with each vertex assigned a component ID. 26 | } 27 | \examples{ 28 | \dontrun{ 29 | # checkpoint directory is required for gf_connected_components() 30 | spark_set_checkpoint_dir(sc, tempdir()) 31 | g <- gf_friends(sc) 32 | gf_connected_components(g) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /man/gf_degrees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_degrees} 4 | \alias{gf_degrees} 5 | \title{Degrees of vertices} 6 | \usage{ 7 | gf_degrees(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Degrees of vertices 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_edge_columns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_edge_columns} 4 | \alias{gf_edge_columns} 5 | \title{Edges column names} 6 | \usage{ 7 | gf_edge_columns(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Edges column names 15 | } 16 | 
-------------------------------------------------------------------------------- /man/gf_edges.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_edges} 4 | \alias{gf_edges} 5 | \title{Extract edges DataFrame} 6 | \usage{ 7 | gf_edges(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Extract edges DataFrame 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_find.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_find} 4 | \alias{gf_find} 5 | \title{Motif finding: Searching the graph for structural patterns} 6 | \usage{ 7 | gf_find(x, pattern) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | 13 | \item{pattern}{pattern specifying a motif to search for} 14 | } 15 | \description{ 16 | Motif finding uses a simple Domain-Specific Language (DSL) for 17 | expressing structural queries. For example, 18 | gf_find(g, "(a)-[e]->(b); (b)-[e2]->(a)") will search for 19 | pairs of vertices a,b connected by edges in both directions. 20 | It will return a DataFrame of all such structures in the graph, 21 | with columns for each of the named elements (vertices or edges) 22 | in the motif. In this case, the returned columns will be in 23 | order of the pattern: "a, e, b, e2." 
24 | } 25 | \examples{ 26 | \dontrun{ 27 | gf_friends(sc) \%>\% 28 | gf_find("(a)-[e]->(b); (b)-[e2]->(a)") 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/gf_friends.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_examples.R 3 | \name{gf_friends} 4 | \alias{gf_friends} 5 | \title{Graph of friends in a social network.} 6 | \usage{ 7 | gf_friends(sc) 8 | } 9 | \arguments{ 10 | \item{sc}{A Spark connection.} 11 | } 12 | \description{ 13 | Graph of friends in a social network. 14 | } 15 | \examples{ 16 | \dontrun{ 17 | library(sparklyr) 18 | sc <- spark_connect(master = "local") 19 | gf_friends(sc) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/gf_graphframe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_graphframe} 4 | \alias{gf_graphframe} 5 | \title{Create a new GraphFrame} 6 | \usage{ 7 | gf_graphframe(vertices = NULL, edges) 8 | } 9 | \arguments{ 10 | \item{vertices}{A \code{tbl_spark} representing vertices.} 11 | 12 | \item{edges}{A \code{tbl_spark} representing edges.} 13 | } 14 | \description{ 15 | Create a new GraphFrame 16 | } 17 | \examples{ 18 | \dontrun{ 19 | library(sparklyr) 20 | sc <- spark_connect(master = "local", version = "2.3.0") 21 | v_tbl <- sdf_copy_to( 22 | sc, data.frame(id = 1:3, name = LETTERS[1:3]) 23 | ) 24 | e_tbl <- sdf_copy_to( 25 | sc, data.frame(src = c(1, 2, 2), dst = c(2, 1, 3), 26 | action = c("love", "hate", "follow")) 27 | ) 28 | gf_graphframe(v_tbl, e_tbl) 29 | gf_graphframe(edges = e_tbl) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/gf_grid_ising_model.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_examples.R 3 | \name{gf_grid_ising_model} 4 | \alias{gf_grid_ising_model} 5 | \title{Generate a grid Ising model with random parameters} 6 | \usage{ 7 | gf_grid_ising_model(sc, n, v_std = 1, e_std = 1) 8 | } 9 | \arguments{ 10 | \item{sc}{A Spark connection.} 11 | 12 | \item{n}{Length of one side of the grid. The grid will be of size n x n.} 13 | 14 | \item{v_std}{Standard deviation of normal distribution used to generate vertex factors "a". Default of 1.0.} 15 | 16 | \item{e_std}{Standard deviation of normal distribution used to generate edge factors "b". Default of 1.0.} 17 | } 18 | \value{ 19 | GraphFrame. Vertices have columns "id" and "a". Edges have columns "src", 20 | "dst", and "b". Edges are directed, but they should be treated as undirected in 21 | any algorithms run on this model. Vertex IDs are of the form "i,j". E.g., vertex 22 | "1,3" is in the second row and fourth column of the grid. 23 | } 24 | \description{ 25 | Generate a grid Ising model with random parameters 26 | } 27 | \details{ 28 | This method generates a grid Ising model with random parameters. Ising models 29 | are probabilistic graphical models over binary variables xi. Each binary 30 | variable xi corresponds to one vertex, and it may take values -1 or +1. 
31 | The probability distribution P(X) (over all xi) is parameterized by 32 | vertex factors ai and edge factors bij: 33 | 34 | \deqn{P(X) = (1/Z) * exp[ \sum_i a_i x_i + \sum_{ij} b_{ij} x_i x_j ]} 35 | } 36 | \examples{ 37 | \dontrun{ 38 | gf_grid_ising_model(sc, 5) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /man/gf_in_degrees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_in_degrees} 4 | \alias{gf_in_degrees} 5 | \title{In-degrees of vertices} 6 | \usage{ 7 | gf_in_degrees(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | In-degrees of vertices 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_lpa.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_lpa.R 3 | \name{gf_lpa} 4 | \alias{gf_lpa} 5 | \title{Label propagation algorithm (LPA)} 6 | \usage{ 7 | gf_lpa(x, max_iter, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | 13 | \item{max_iter}{Maximum number of iterations.} 14 | 15 | \item{...}{Optional arguments, currently not used.} 16 | } 17 | \description{ 18 | Run static Label Propagation for detecting communities in networks. Each node in the 19 | network is initially assigned to its own community. At every iteration, nodes send 20 | their community affiliation to all neighbors and update their state to the mode 21 | community affiliation of incoming messages. LPA is a standard community detection 22 | algorithm for graphs. 
It is very inexpensive 23 | computationally, although (1) convergence is not guaranteed and (2) one can 24 | end up with trivial solutions (all nodes are identified into a single community). 25 | } 26 | \examples{ 27 | \dontrun{ 28 | g <- gf_friends(sc) 29 | gf_lpa(g, max_iter = 5) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /man/gf_out_degrees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_out_degrees} 4 | \alias{gf_out_degrees} 5 | \title{Out-degrees of vertices} 6 | \usage{ 7 | gf_out_degrees(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Out-degrees of vertices 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_pagerank.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_pagerank.R 3 | \name{gf_pagerank} 4 | \alias{gf_pagerank} 5 | \title{PageRank} 6 | \usage{ 7 | gf_pagerank(x, tol = NULL, reset_probability = 0.15, max_iter = NULL, 8 | source_id = NULL, ...)
9 | } 10 | \arguments{ 11 | \item{x}{An object coercible to a GraphFrame (typically, a 12 | \code{gf_graphframe}).} 13 | 14 | \item{tol}{Tolerance.} 15 | 16 | \item{reset_probability}{Reset probability.} 17 | 18 | \item{max_iter}{Maximum number of iterations.} 19 | 20 | \item{source_id}{(Optional) Source vertex for a personalized PageRank.} 21 | 22 | \item{...}{Optional arguments, currently not used.} 23 | } 24 | \description{ 25 | PageRank 26 | } 27 | \examples{ 28 | \dontrun{ 29 | g <- gf_friends(sc) 30 | gf_pagerank(g, reset_probability = 0.15, tol = 0.01) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /man/gf_persist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_persist} 4 | \alias{gf_persist} 5 | \title{Persist the GraphFrame} 6 | \usage{ 7 | gf_persist(x, storage_level = "MEMORY_AND_DISK") 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | 13 | \item{storage_level}{The storage level to be used.
Please view the 14 | \href{https://spark.apache.org/docs/latest/rdd-programming-guide.html#rdd-persistence}{Spark Documentation} 15 | for information on what storage levels are accepted.} 16 | } 17 | \description{ 18 | Persist the GraphFrame 19 | } 20 | -------------------------------------------------------------------------------- /man/gf_register.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_register} 4 | \alias{gf_register} 5 | \title{Register a GraphFrame object} 6 | \usage{ 7 | gf_register(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Register a GraphFrame object 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_scc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_scc.R 3 | \name{gf_scc} 4 | \alias{gf_scc} 5 | \title{Strongly connected components} 6 | \usage{ 7 | gf_scc(x, max_iter, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | 13 | \item{max_iter}{Maximum number of iterations.} 14 | 15 | \item{...}{Optional arguments, currently not used.} 16 | } 17 | \description{ 18 | Compute the strongly connected component (SCC) of each vertex and return a 19 | DataFrame with each vertex assigned to the SCC containing that vertex.
20 | } 21 | \examples{ 22 | \dontrun{ 23 | g <- gf_friends(sc) 24 | gf_scc(g, max_iter = 10) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/gf_shortest_paths.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_shortest_paths.R 3 | \name{gf_shortest_paths} 4 | \alias{gf_shortest_paths} 5 | \title{Shortest paths} 6 | \usage{ 7 | gf_shortest_paths(x, landmarks, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | 13 | \item{landmarks}{IDs of landmark vertices.} 14 | 15 | \item{...}{Optional arguments, currently not used.} 16 | } 17 | \description{ 18 | Computes shortest paths from every vertex to the given set of landmark vertices. 19 | Note that this takes edge direction into account. 20 | } 21 | \examples{ 22 | \dontrun{ 23 | g <- gf_friends(sc) 24 | gf_shortest_paths(g, landmarks = c("a", "d")) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/gf_star.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_examples.R 3 | \name{gf_star} 4 | \alias{gf_star} 5 | \title{Generate a star graph} 6 | \usage{ 7 | gf_star(sc, n) 8 | } 9 | \arguments{ 10 | \item{sc}{A Spark connection.} 11 | 12 | \item{n}{The number of leaves.} 13 | } 14 | \description{ 15 | Returns a star graph with Long ID type, consisting of a central element 16 | indexed 0 (the root) and the n other leaf vertices 1, 2, ..., n.
17 | } 18 | \examples{ 19 | \dontrun{ 20 | gf_star(sc, 5) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /man/gf_triangle_count.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_triangle_count.R 3 | \name{gf_triangle_count} 4 | \alias{gf_triangle_count} 5 | \title{Computes the number of triangles passing through each vertex} 6 | \usage{ 7 | gf_triangle_count(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | 13 | \item{...}{Optional arguments, currently not used.} 14 | } 15 | \description{ 16 | This algorithm ignores edge direction; i.e., all edges are treated 17 | as undirected. In a multigraph, duplicate edges will be counted only once. 18 | } 19 | \examples{ 20 | \dontrun{ 21 | g <- gf_friends(sc) 22 | gf_triangle_count(g) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /man/gf_triplets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_triplets} 4 | \alias{gf_triplets} 5 | \title{Triplets of graph} 6 | \usage{ 7 | gf_triplets(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Triplets of graph 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_two_blobs.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_examples.R 3 | \name{gf_two_blobs} 4 | \alias{gf_two_blobs} 5 | \title{Generate two blobs} 6 | \usage{ 7 | gf_two_blobs(sc, blob_size) 8 | } 9 |
\arguments{ 10 | \item{sc}{A Spark connection.} 11 | 12 | \item{blob_size}{The size of each blob.} 13 | } 14 | \description{ 15 | Two densely connected blobs (vertices 0->n-1 and n->2n-1, where n is \code{blob_size}) 16 | connected by a single edge (0->n). 17 | } 18 | \examples{ 19 | \dontrun{ 20 | gf_two_blobs(sc, 3) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /man/gf_unpersist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_unpersist} 4 | \alias{gf_unpersist} 5 | \title{Unpersist the GraphFrame} 6 | \usage{ 7 | gf_unpersist(x, blocking = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | 13 | \item{blocking}{Whether to block until all blocks are deleted.} 14 | } 15 | \description{ 16 | Unpersist the GraphFrame 17 | } 18 | -------------------------------------------------------------------------------- /man/gf_vertex_columns.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_vertex_columns} 4 | \alias{gf_vertex_columns} 5 | \title{Vertices column names} 6 | \usage{ 7 | gf_vertex_columns(x) 8 | } 9 | \arguments{ 10 | \item{x}{An object coercible to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Vertices column names 15 | } 16 | -------------------------------------------------------------------------------- /man/gf_vertices.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{gf_vertices} 4 | \alias{gf_vertices} 5 | \title{Extract vertices DataFrame} 6 | \usage{ 7 | gf_vertices(x) 8 | } 9 |
\arguments{ 10 | \item{x}{An object coercable to a GraphFrame (typically, a 11 | \code{gf_graphframe}).} 12 | } 13 | \description{ 14 | Extract vertices DataFrame 15 | } 16 | -------------------------------------------------------------------------------- /man/spark_graphframe.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gf_interface.R 3 | \name{spark_graphframe} 4 | \alias{spark_graphframe} 5 | \title{Retrieve a GraphFrame} 6 | \usage{ 7 | spark_graphframe(x, ...) 8 | 9 | spark_graphframe(x, ...) 10 | } 11 | \arguments{ 12 | \item{x}{An object coercable to a GraphFrame (typically, a 13 | \code{gf_graphframe}).} 14 | 15 | \item{...}{additional arguments, not used} 16 | } 17 | \description{ 18 | Retrieve a GraphFrame 19 | } 20 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(graphframes) 3 | 4 | if (identical(Sys.getenv("NOT_CRAN"), "true")) { 5 | test_check("graphframes") 6 | on.exit({spark_disconnect_all()}) 7 | } 8 | -------------------------------------------------------------------------------- /tests/testthat/helper-initialize.R: -------------------------------------------------------------------------------- 1 | "%||%" <- function(x, y) { 2 | if (is.null(x)) y else x 3 | } 4 | 5 | # helper functions from sparklyr tests 6 | # https://github.com/rstudio/sparklyr/blob/master/tests/testthat/helper-initialize.R 7 | testthat_spark_connection <- function() { 8 | version <- Sys.getenv("SPARK_VERSION", unset = "2.3.0") 9 | 10 | spark_installed <- sparklyr::spark_installed_versions() 11 | if (nrow(spark_installed[spark_installed$spark == version, ]) == 0) { 12 | options(sparkinstall.verbose = TRUE) 13 | sparklyr::spark_install(version) 14 | } 15 | 16 | 
expect_gt(nrow(sparklyr::spark_installed_versions()), 0) 17 | 18 | # generate connection if none yet exists 19 | connected <- FALSE 20 | if (exists(".testthat_spark_connection", envir = .GlobalEnv)) { 21 | sc <- get(".testthat_spark_connection", envir = .GlobalEnv) 22 | connected <- sparklyr::connection_is_open(sc) 23 | } 24 | 25 | if (!connected) { 26 | config <- sparklyr::spark_config() 27 | 28 | options(sparklyr.sanitize.column.names.verbose = TRUE) 29 | options(sparklyr.verbose = TRUE) 30 | options(sparklyr.na.omit.verbose = TRUE) 31 | options(sparklyr.na.action.verbose = TRUE) 32 | 33 | sc <- sparklyr::spark_connect(master = "local", version = version, config = config) 34 | assign(".testthat_spark_connection", sc, envir = .GlobalEnv) 35 | } 36 | 37 | # retrieve spark connection 38 | get(".testthat_spark_connection", envir = .GlobalEnv) 39 | } 40 | 41 | testthat_tbl <- function(name) { 42 | sc <- testthat_spark_connection() 43 | tbl <- tryCatch(dplyr::tbl(sc, name), error = identity) 44 | if (inherits(tbl, "error")) { 45 | data <- eval(as.name(name), envir = parent.frame()) 46 | tbl <- dplyr::copy_to(sc, data, name = name) 47 | } 48 | tbl 49 | } 50 | 51 | skip_unless_verbose <- function(message = NULL) { 52 | message <- message %||% "Verbose test skipped" 53 | verbose <- Sys.getenv("SPARKLYR_TESTS_VERBOSE", unset = NA) 54 | if (is.na(verbose)) skip(message) 55 | invisible(TRUE) 56 | } 57 | 58 | test_requires <- function(...) 
{ 59 | 60 | for (pkg in list(...)) { 61 | if (!require(pkg, character.only = TRUE, quietly = TRUE)) { 62 | fmt <- "test requires '%s' but '%s' is not installed" 63 | skip(sprintf(fmt, pkg, pkg)) 64 | } 65 | } 66 | 67 | invisible(TRUE) 68 | } 69 | -------------------------------------------------------------------------------- /tests/testthat/output/friends.txt: -------------------------------------------------------------------------------- 1 | GraphFrame 2 | Vertices: 3 | $ id "a", "b", "c", "d", "e", "f", "g" 4 | $ name "Alice", "Bob", "Charlie", "David", "Esther", "Fanny", "Gabby" 5 | $ age 34, 36, 30, 29, 32, 36, 60 6 | Edges: 7 | $ src "a", "b", "c", "f", "e", "e", "d", "a" 8 | $ dst "b", "c", "b", "c", "f", "d", "a", "e" 9 | $ relationship "friend", "follow", "follow", "follow", "follow", "fri... 10 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_bfs.txt: -------------------------------------------------------------------------------- 1 | Observations: 1 2 | Variables: 3 3 | $ from [["e", "Esther", 32]] 4 | $ e0 [["e", "d", "friend"]] 5 | $ to [["d", "David", 29]] 6 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_chain.txt: -------------------------------------------------------------------------------- 1 | GraphFrame 2 | Vertices: 3 | $ id 0, 1, 2, 3, 4 4 | Edges: 5 | $ src 0, 1, 2, 3 6 | $ dst 1, 2, 3, 4 7 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_degrees.txt: -------------------------------------------------------------------------------- 1 | # A tibble: 6 x 2 2 | id degree 3 | 4 | 1 a 3 5 | 2 b 3 6 | 3 c 3 7 | 4 d 2 8 | 5 e 3 9 | 6 f 2 10 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_find.txt: -------------------------------------------------------------------------------- 1 | Observations: 2 2 | Variables: 
4 3 | $ a [["b", "Bob", 36], ["c", "Charlie", 30]] 4 | $ e [["b", "c", "follow"], ["c", "b", "follow"]] 5 | $ b [["c", "Charlie", 30], ["b", "Bob", 36]] 6 | $ e2 [["c", "b", "follow"], ["b", "c", "follow"]] 7 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_out_degrees.txt: -------------------------------------------------------------------------------- 1 | # A tibble: 6 x 2 2 | id outDegree 3 | 4 | 1 a 2 5 | 2 b 1 6 | 3 c 1 7 | 4 d 1 8 | 5 e 2 9 | 6 f 1 10 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_pagerank.txt: -------------------------------------------------------------------------------- 1 | # A tibble: 7 x 4 2 | id name age pagerank 3 | 4 | 1 a Alice 34 0.449 5 | 2 b Bob 36 2.66 6 | 3 c Charlie 30 2.69 7 | 4 d David 29 0.328 8 | 5 e Esther 32 0.371 9 | 6 f Fanny 36 0.328 10 | 7 g Gabby 60 0.180 11 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_shortest_paths.txt: -------------------------------------------------------------------------------- 1 | # A tibble: 7 x 4 2 | id name age distances 3 | 4 | 1 a Alice 34 5 | 2 b Bob 36 6 | 3 c Charlie 30 7 | 4 d David 29 8 | 5 e Esther 32 9 | 6 f Fanny 36 10 | 7 g Gabby 60 11 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_star.txt: -------------------------------------------------------------------------------- 1 | GraphFrame 2 | Vertices: 3 | $ id 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 4 | Edges: 5 | $ dst 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 6 | $ src 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 7 | -------------------------------------------------------------------------------- /tests/testthat/output/gf_triangle_count.txt: -------------------------------------------------------------------------------- 1 | # A tibble: 7 x 4 2 | count id name age 3 | 4 | 1 1 a Alice 34 5 | 2 0 b Bob 36 6 | 3 0 c 
Charlie 30 7 | 4 1 d David 29 8 | 5 1 e Esther 32 9 | 6 0 f Fanny 36 10 | 7 0 g Gabby 60 11 | -------------------------------------------------------------------------------- /tests/testthat/output/triplets.txt: -------------------------------------------------------------------------------- 1 | Observations: 8 2 | Variables: 3 3 | $ src [["a", "Alice", 34], ["b", "Bob", 36], ["c", "Charlie", 30], ... 4 | $ edge [["a", "b", "friend"], ["b", "c", "follow"], ["c", "b", "foll... 5 | $ dst [["b", "Bob", 36], ["c", "Charlie", 30], ["b", "Bob", 36], ["... 6 | -------------------------------------------------------------------------------- /tests/testthat/output/two_blobs.txt: -------------------------------------------------------------------------------- 1 | GraphFrame 2 | Vertices: 3 | $ id 0, 1, 2, 3 4 | $ v_attr1 "0", "1", "2", "3" 5 | $ v_attr2 0, 1, 2, 3 6 | Edges: 7 | $ src 0, 0, 1, 1, 2, 2, 3, 3, 0 8 | $ dst 0, 1, 0, 1, 2, 3, 2, 3, 2 9 | $ e_attr1 "0-0", "0-1", "1-0", "1-1", "2-2", "2-3", "3-2", "3-3", "0-2" 10 | -------------------------------------------------------------------------------- /tests/testthat/test-gf-algos.R: -------------------------------------------------------------------------------- 1 | context("gf algorithms") 2 | 3 | sc <- testthat_spark_connection() 4 | test_requires("dplyr") 5 | 6 | test_that("gf_bfs() works", { 7 | expect_known_output( 8 | gf_friends(sc) %>% 9 | gf_bfs(from_expr = "name = 'Esther'", to_expr = "age < 32") %>% 10 | collect() %>% 11 | glimpse(), 12 | "output/gf_bfs.txt", 13 | print = TRUE 14 | ) 15 | }) 16 | 17 | test_that("gf_find() works", { 18 | expect_known_output( 19 | gf_friends(sc) %>% 20 | gf_find("(a)-[e]->(b); (b)-[e2]->(a)") %>% 21 | collect() %>% 22 | glimpse(), 23 | "output/gf_find.txt", 24 | print = TRUE 25 | ) 26 | }) 27 | 28 | test_that("gf_connected_components() works", { 29 | spark_set_checkpoint_dir(sc, tempdir()) 30 | expect_identical( 31 | gf_friends(sc) %>% 32 | gf_connected_components() %>% 33 | 
pull(component) %>% 34 | unique() %>% 35 | length(), 36 | 2L 37 | ) 38 | }) 39 | 40 | test_that("gf_scc() works", { 41 | expect_identical( 42 | gf_friends(sc) %>% 43 | gf_scc(max_iter = 10) %>% 44 | pull(component) %>% 45 | unique() %>% 46 | length(), 47 | 4L 48 | ) 49 | }) 50 | 51 | test_that("gf_pagerank() works", { 52 | expect_known_output( 53 | gf_friends(sc) %>% 54 | gf_pagerank(reset_probability = 0.15, tol = 0.01) %>% 55 | gf_vertices() %>% 56 | collect() %>% 57 | arrange(id, name), 58 | "output/gf_pagerank.txt", 59 | print = TRUE 60 | ) 61 | }) 62 | 63 | test_that("gf_shortest_paths() works", { 64 | expect_known_output( 65 | gf_friends(sc) %>% 66 | gf_shortest_paths(landmarks = c("a", "d")) %>% 67 | collect() %>% 68 | arrange(id, name), 69 | "output/gf_shortest_paths.txt", 70 | print = TRUE 71 | ) 72 | }) 73 | 74 | test_that("gf_triangle_count() works", { 75 | expect_known_output( 76 | gf_friends(sc) %>% 77 | gf_triangle_count() %>% 78 | collect() %>% 79 | arrange(id, name), 80 | "output/gf_triangle_count.txt", 81 | print = TRUE 82 | ) 83 | }) 84 | 85 | test_that("gf_lpa() works", { 86 | lpa_result <- gf_friends(sc) %>% 87 | gf_lpa(max_iter = 5) %>% 88 | collect() 89 | expect_identical(dim(lpa_result), 90 | c(7L, 4L)) 91 | expect_identical(names(lpa_result), 92 | c("id", "name", "age", "label")) 93 | }) 94 | 95 | -------------------------------------------------------------------------------- /tests/testthat/test-gf-examples.R: -------------------------------------------------------------------------------- 1 | context("gf examples") 2 | 3 | sc <- testthat_spark_connection() 4 | test_requires("dplyr") 5 | 6 | test_that("gf_star() works", { 7 | expect_known_output( 8 | gf_star(sc, n = 10), 9 | "output/gf_star.txt", 10 | print = TRUE 11 | ) 12 | }) 13 | 14 | test_that("gf_chain() works", { 15 | expect_known_output( 16 | gf_chain(sc, n = 5), 17 | "output/gf_chain.txt", 18 | print = TRUE 19 | ) 20 | }) 21 | 22 | test_that("gf_grid_ising_model() works", { 23 | 
grid_ising_model <- gf_grid_ising_model(sc, n = 5) 24 | expect_identical( 25 | grid_ising_model %>% 26 | gf_edge_columns(), 27 | c("src", "dst", "b") 28 | ) 29 | expect_identical( 30 | grid_ising_model %>% 31 | gf_vertex_columns(), 32 | c("i", "j", "id", "a") 33 | ) 34 | }) 35 | 36 | test_that("gf_two_blobs() works", { 37 | expect_known_output( 38 | gf_two_blobs(sc, 2), 39 | "output/two_blobs.txt", 40 | print = TRUE 41 | ) 42 | }) 43 | -------------------------------------------------------------------------------- /tests/testthat/test-gf-interface.R: -------------------------------------------------------------------------------- 1 | context("gf interface") 2 | 3 | sc <- testthat_spark_connection() 4 | test_requires("dplyr") 5 | 6 | v <- data_frame(id = 1:3, name = LETTERS[1:3]) 7 | e <- data_frame(src = c(1, 2, 2), dst = c(2, 1, 3), 8 | action = c("love", "hate", "follow")) 9 | v_tbl <- testthat_tbl("v") 10 | e_tbl <- testthat_tbl("e") 11 | 12 | test_that("construction from DataFrame works", { 13 | g <- gf_graphframe(v_tbl, e_tbl) 14 | 15 | expect_equal(g %>% gf_vertices() %>% collect(), v) 16 | expect_equal(g %>% gf_edges() %>% collect(), e) 17 | }) 18 | 19 | test_that("construction from edge frame works", { 20 | g <- gf_graphframe(edges = e_tbl) 21 | 22 | ids_from_vertices <- g %>% 23 | gf_vertices() %>% 24 | collect() %>% 25 | unlist(use.names = FALSE) 26 | 27 | ids_from_edges <- g %>% 28 | gf_edges() %>% 29 | collect() %>% 30 | select(src, dst) %>% 31 | unlist(use.names = FALSE) %>% 32 | unique() 33 | 34 | expect_true(setequal(ids_from_vertices, ids_from_edges)) 35 | }) 36 | 37 | test_that("printing graphframes", { 38 | expect_known_output(gf_friends(sc), 39 | "output/friends.txt", 40 | print = TRUE) 41 | }) 42 | 43 | test_that("gf_triplets() works", { 44 | expect_known_output( 45 | gf_friends(sc) %>% 46 | gf_triplets() %>% 47 | collect() %>% 48 | glimpse(), 49 | "output/triplets.txt", 50 | print = TRUE,) 51 | }) 52 | 53 | test_that("gf_vertex_columns() 
works", { 54 | expect_identical( 55 | gf_friends(sc) %>% 56 | gf_vertex_columns(), 57 | c("id", "name", "age") 58 | ) 59 | }) 60 | 61 | test_that("gf_edge_columns() works", { 62 | expect_identical( 63 | gf_friends(sc) %>% 64 | gf_edge_columns(), 65 | c("src", "dst", "relationship") 66 | ) 67 | }) 68 | 69 | test_that("gf_out_degrees() works", { 70 | expect_known_output( 71 | gf_friends(sc) %>% 72 | gf_out_degrees() %>% 73 | collect() %>% 74 | arrange(id), 75 | "output/gf_out_degrees.txt", 76 | print = TRUE 77 | ) 78 | }) 79 | 80 | test_that("gf_degrees() works", { 81 | expect_known_output( 82 | gf_friends(sc) %>% 83 | gf_degrees() %>% 84 | collect() %>% 85 | arrange(id), 86 | "output/gf_degrees.txt", 87 | print = TRUE 88 | ) 89 | }) 90 | -------------------------------------------------------------------------------- /tools/readme/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/graphframes/444ed9d9a04fd11eaf504ba164f6502c25cca91b/tools/readme/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /tools/readme/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/graphframes/444ed9d9a04fd11eaf504ba164f6502c25cca91b/tools/readme/unnamed-chunk-5-1.png --------------------------------------------------------------------------------