├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── aPPR-package.R ├── aPPR.R ├── abstract-graph.R ├── graph-igraph.R ├── graph-rtweet.R ├── tracker.R └── update.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── aPPR.Rproj ├── codecov.yml ├── cran-comments.md ├── man ├── Tracker.Rd ├── aPPR-package.Rd ├── abstract_graph.Rd ├── appr.Rd ├── check.Rd ├── neighborhood.Rd ├── node_degrees.Rd ├── rtweet_graph.Rd └── update.Tracker.Rd ├── tests ├── testthat.R └── testthat │ └── test-matches-igraph.R └── vignettes ├── .gitignore └── extending-appr.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^aPPR\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^LICENSE\.md$ 5 | ^_pkgdown\.yml$ 6 | ^docs$ 7 | ^pkgdown$ 8 | ^codecov\.yml$ 9 | ^\.github$ 10 | ^cran-comments\.md$ 11 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macos-latest, r: 'release'} 26 | 27 | - {os: windows-latest, r: 'release'} 28 | # use 4.1 to check with rtools40's older compiler 29 | - {os: windows-latest, r: '4.1'} 30 | 31 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 32 | - {os: ubuntu-latest, r: 'release'} 33 | - {os: ubuntu-latest, r: 'oldrel-1'} 34 | - {os: ubuntu-latest, r: 'oldrel-2'} 35 | - {os: ubuntu-latest, r: 'oldrel-3'} 36 | 37 | env: 38 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 39 | R_KEEP_PKG_SOURCE: yes 40 | 41 | steps: 42 | - uses: actions/checkout@v3 43 | 44 | - uses: r-lib/actions/setup-pandoc@v2 45 | 46 | - uses: r-lib/actions/setup-r@v2 47 | with: 48 | r-version: ${{ matrix.config.r }} 49 | http-user-agent: ${{ matrix.config.http-user-agent }} 50 | use-public-rspm: true 51 | 52 | - uses: r-lib/actions/setup-r-dependencies@v2 53 | with: 54 | extra-packages: any::rcmdcheck 55 | needs: check 56 | 57 | - uses: r-lib/actions/check-r-package@v2 58 | with: 59 | upload-snapshots: true 60 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - uses: r-lib/actions/pr-fetch@v2 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: any::roxygen2 30 | needs: pr-document 31 | 32 | - name: Document 33 | run: roxygen2::roxygenise() 34 | shell: Rscript {0} 35 | 36 | - name: commit 37 | run: | 38 | git config --local user.name "$GITHUB_ACTOR" 39 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 40 | git add man/\* NAMESPACE 41 | git commit -m 'Document' 42 | 43 | - uses: r-lib/actions/pr-push@v2 44 | with: 45 | repo-token: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | style: 48 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} 49 | name: style 50 | runs-on: ubuntu-latest 51 | env: 52 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 53 | steps: 54 | - uses: actions/checkout@v3 55 | 56 | - uses: r-lib/actions/pr-fetch@v2 57 | with: 58 | repo-token: ${{ secrets.GITHUB_TOKEN }} 59 | 60 | - uses: r-lib/actions/setup-r@v2 61 | 62 | - name: Install dependencies 63 | run: install.packages("styler") 64 | shell: Rscript {0} 65 | 66 | - name: Style 67 | run: styler::style_pkg() 68 | shell: Rscript {0} 69 | 70 | - name: commit 71 | run: | 72 | git config --local user.name 
"$GITHUB_ACTOR" 73 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 74 | git add \*.R 75 | git commit -m 'Style' 76 | 77 | - uses: r-lib/actions/pr-push@v2 78 | with: 79 | repo-token: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | 
inst/doc 5 | docs 6 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: aPPR 2 | Title: Approximate Personalized PageRank 3 | Version: 0.0.0.9200 4 | Authors@R: c( 5 | person("Alex", "Hayes", , "alexpghayes@gmail.com", role = c("aut", "cre", "cph"), 6 | comment = c(ORCID = "0000-0002-4985-5160")), 7 | person("Fan", "Chen", , "fan.chen@wisc.edu", role = "aut", 8 | comment = c(ORCID = "0000-0003-4508-6023")), 9 | person("Karl", "Rohe", role = "aut") 10 | ) 11 | Description: Calculates approximate and regularized personalized PageRank 12 | vectors for massive graphs, including those that can only be queried 13 | via an API. Regularization allows discovery of community structure 14 | under some stochastic block models. 15 | License: MIT + file LICENSE 16 | URL: https://rohelab.github.io/aPPR/, https://github.com/RoheLab/aPPR 17 | BugReports: https://github.com/RoheLab/aPPR/issues 18 | Imports: 19 | ellipsis, 20 | glue, 21 | logger, 22 | memoise, 23 | pander, 24 | R6, 25 | tibble 26 | Suggests: 27 | covr, 28 | igraph (>= 1.2.5), 29 | knitr, 30 | rentrez, 31 | rmarkdown, 32 | rtweet (>= 0.7.0.9011), 33 | testthat (>= 3.0.0) 34 | Remotes: 35 | ropensci/rtweet 36 | Encoding: UTF-8 37 | LazyData: true 38 | Roxygen: list(markdown = TRUE) 39 | RoxygenNote: 7.2.3 40 | Collate: 41 | 'aPPR-package.R' 42 | 'abstract-graph.R' 43 | 'aPPR.R' 44 | 'graph-igraph.R' 45 | 'graph-rtweet.R' 46 | 'tracker.R' 47 | 'update.R' 48 | Config/testthat/edition: 3 49 | VignetteBuilder: knitr 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2022 2 | COPYRIGHT HOLDER: aPPR authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 aPPR authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(appr,abstract_graph) 4 | S3method(appr,igraph) 5 | S3method(appr,rtweet_graph) 6 | S3method(print,abstract_graph) 7 | S3method(update,Tracker) 8 | export(abstract_graph) 9 | export(appr) 10 | export(check) 11 | export(neighborhood) 12 | export(node_degrees) 13 | export(rtweet_graph) 14 | import(logger) 15 | import(pander) 16 | importFrom(R6,R6Class) 17 | importFrom(glue,glue) 18 | importFrom(memoise,memoise) 19 | importFrom(tibble,tibble) 20 | -------------------------------------------------------------------------------- /R/aPPR-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | # The following block is used by usethis to automatically manage 5 | # roxygen namespace tags. Modify with care! 6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | 10 | #' @import logger 11 | #' @importFrom glue glue 12 | NULL 13 | 14 | #' @import pander 15 | .onLoad <- function(libname, pkgname) { 16 | log_formatter(formatter_pander, namespace = pkgname) 17 | } 18 | -------------------------------------------------------------------------------- /R/aPPR.R: -------------------------------------------------------------------------------- 1 | #' Approximate personalized pageranks 2 | #' 3 | #' Computes the personalized pagerank for specified seeds using the 4 | #' `ApproximatePageRank` algorithm of Andersen et al. (2006). Computes 5 | #' degree-adjustments and degree-regularization of personalized 6 | #' pagerank vectors as described in Algorithms 3 and 4 of Chen et al. (2019). 7 | #' These algorithms are randomized; if results are unstable across 8 | #' multiple runs, decrease `epsilon`. 
9 | #' 10 | #' @param graph An [abstract_graph()] object, such as that created by 11 | #' [rtweet_graph()]. This argument is required. 12 | #' 13 | #' @param seeds A character vector of seeds for the personalized pagerank. 14 | #' The personalized pagerank will return to each of these seeds with 15 | #' probability `alpha` at each node transition. At the moment, 16 | #' all seeds are given equal weighting. This argument is required. 17 | #' 18 | #' @param alpha Teleportation constant. The teleportation constant is the 19 | #' probability of returning to a seed node at each node transition. 20 | #' `alpha` must be a valid probability; that is, between zero and one. 21 | #' Defaults to `0.15`. This is the inverse of the "dampening factor" 22 | #' in the original PageRank paper, so `alpha = 0.15` corresponds 23 | #' to a dampening factor of `0.85`. Runtime is proportional to 24 | #' `1 / (epsilon * alpha)`, so small `alpha` can result in long 25 | #' runtimes. 26 | #' 27 | #' @param epsilon Desired accuracy of approximation. `epsilon` must be 28 | #' a small positive number. Defaults to `1e-6`. `aPPR` guarantees that 29 | #' approximated personalized pageranks are uniformly within `epsilon` of 30 | #' their true value. That is, the approximation is guaranteed to be good 31 | #' in an L-infinity sense. This does not guarantee, however, that 32 | #' a ranking of nodes by aPPR is close to a ranking of nodes by PPR. 33 | #' 34 | #' For Twitter graphs, we recommend testing your code with `1e-4` or `1e-5`, 35 | #' using `1e-6` for exploration, and `1e-7` to `1e-8` for final results, 36 | #' although these numbers are very rough. It is also perfectly reasonable 37 | #' to run `aPPR` for a given number of steps (set via `max_visits`), 38 | #' and then note the approximation accuracy of your results. Internally, 39 | #' `aPPR` keeps a running estimate of achieved accuracy that is always valid. 
40 | #' 41 | #' Anytime you would like to explore more of the graph, you can simply 42 | #' decrease `epsilon`. So you can start with `epsilon = 1e-5` and then 43 | #' gradually decrease `epsilon` until you have a sample of the graph 44 | #' that you are happy with. 45 | #' 46 | #' Also note that runtime is proportional to `1 / (epsilon * alpha)`, 47 | #' so small `epsilon` can result in long runtimes. 48 | #' 49 | #' @param tau Regularization term. Additionally inflates the in-degree 50 | #' of each observation by this term by performing the degree 51 | #' adjustment described in Algorithm 3 and Algorithm 4, which 52 | #' are described in `vignette("Mathematical details")`. Defaults to 53 | #' `NULL`, in which case `tau` is set to the average in-degree of 54 | #' the observed nodes. In general, it's reasonable to 55 | #' set `tau` to the average in-degree of the graph. 56 | #' 57 | #' @param max_visits Maximum number of unique nodes to visit. Should be a 58 | #' positive integer. Defaults to `Inf`, such that there is no upper bound 59 | #' on the number of unique nodes to visit. Useful when you want to specify a 60 | #' fixed amount of computation (or API calls) to use rather than an 61 | #' error tolerance. We recommend debugging with `max_visits ~ 20`, 62 | #' exploration with `max_visits` in the hundreds, and `max_visits` in the 63 | #' thousands to tens of thousands for precise results, although this is a 64 | #' very rough heuristic. 65 | #' 66 | #' @param ... Ignored. Passing arguments to `...` results in a warning. 67 | #' 68 | #' 69 | #' @return A [Tracker()] object. Most relevant is the `stats` field, 70 | #' a [tibble::tibble()] with the following columns: 71 | #' 72 | #' - `name`: Name of a node (character). 73 | #' - `p`: Current estimate of the pagerank for a node. 74 | #' - `r`: Current estimate of residual per out-degree for a node. 75 | #' - `in_degree`: Number of incoming edges to a node. 
76 | #' - `out_degree`: Number of outgoing edges from a node. 77 | #' - `degree_adjusted`: The personalized pagerank divided by the 78 | #' node in-degree. 79 | #' - `regularized`: The personalized pagerank divided by the node 80 | #' in-degree plus `tau`. 81 | #' 82 | #' When computing personalized pageranks for Twitter users (e.g. 83 | #' via [rtweet_graph()]), `name` is given 84 | #' as a user ID, not a screen name, regardless of how the seed nodes 85 | #' were specified. 86 | #' 87 | #' @export 88 | #' 89 | #' @references 90 | #' 91 | #' 1. Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from Massive Block Model Graphs with Personalized PageRank.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 82, no. 1 (February 2020): 99–126. https://doi.org/10.1111/rssb.12349. 92 | #' 2. Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, USA: IEEE, 2006. https://doi.org/10.1109/FOCS.2006.44. 93 | #' 94 | #' @examples 95 | #' 96 | #' library(aPPR) 97 | #' library(igraph) 98 | #' 99 | #' set.seed(27) 100 | #' 101 | #' graph <- rtweet_graph() 102 | #' 103 | #' \dontrun{ 104 | #' appr(graph, "alexpghayes") 105 | #' } 106 | #' 107 | #' graph2 <- sample_pa(100) 108 | #' 109 | #' # this creates a Tracker object 110 | #' ppr_results <- appr(graph2, seeds = "5") 111 | #' 112 | #' # the portion of the Tracker object you probably care about 113 | #' ppr_results$stats 114 | #' 115 | appr <- function(graph, seeds, ..., alpha = 0.15, epsilon = 1e-6, tau = NULL, 116 | max_visits = Inf) { 117 | ellipsis::check_dots_used() 118 | 119 | if (alpha <= 0 || alpha >= 1) 120 | stop("`alpha` must be strictly between zero and one.", call. = FALSE) 121 | 122 | if (epsilon <= 0 || epsilon >= 1) 123 | stop("`epsilon` must be strictly between zero and one.", call. 
= FALSE) 124 | 125 | if (!is.null(tau) && tau < 0) 126 | stop("`tau` must be greater than zero.", call. = FALSE) 127 | 128 | UseMethod("appr") 129 | } 130 | 131 | #' @include abstract-graph.R 132 | #' @export 133 | appr.abstract_graph <- function(graph, seeds, ..., alpha = 0.15, 134 | epsilon = 1e-6, tau = NULL, 135 | max_visits = Inf) { 136 | tracker <- Tracker$new(graph, alpha, epsilon, tau, max_visits) 137 | 138 | log_debug("Checking seed nodes ... ") 139 | good_seeds <- check(graph, seeds) 140 | log_debug(glue("Checking seed nodes ... good_seeds: {good_seeds}")) 141 | log_debug("Checking seed nodes ... done") 142 | 143 | for (seed in seeds) { 144 | 145 | if (!(seed %in% good_seeds)) { 146 | stop( 147 | glue("Seed {seed} must be available and have positive out degree."), 148 | call. = FALSE 149 | ) 150 | } 151 | 152 | log_info(glue("Adding seed {seed} to tracker ...")) 153 | tracker$add_seed(seed, preference = 1 / length(seeds)) 154 | log_info(glue("Adding seed {seed} to tracker ... done")) 155 | 156 | } 157 | 158 | tracker$calculate_ppr() 159 | tracker$regularize() 160 | tracker 161 | } 162 | -------------------------------------------------------------------------------- /R/abstract-graph.R: -------------------------------------------------------------------------------- 1 | #' Create an abstract graph object 2 | #' 3 | #' Could be an actual graph object, or a graph such as the Twitter 4 | #' following network defined implicitly via API requests, etc. 5 | #' The abstract graph is just a list with `abstract_graph` class 6 | #' and your desired subclass. 7 | #' 8 | #' @param subclass Desired subclass (character). 9 | #' @param ... Other arguments to pass to `list()`. See 10 | #' [rtweet_graph()] for an example. 11 | #' 12 | #' @export 13 | abstract_graph <- function(subclass, ...) { 14 | graph <- list(...) 
15 | class(graph) <- c(subclass, "abstract_graph") 16 | graph 17 | } 18 | 19 | #' Check if a node in an abstract graph is acceptable for inclusion in PPR 20 | #' 21 | #' Inclusion criteria: 22 | #' 23 | #' - At least one outgoing edge 24 | #' - Can get in degree and out degree of node 25 | #' - Can get all nodes connected to `node` / the 1-hop neighborhood 26 | #' 27 | #' @param graph A graph object. 28 | #' @param nodes The name(s) of node(s) in `graph` as a character vector. 29 | #' 30 | #' @return The subset of `nodes` that are acceptable for inclusion. This 31 | #' can be a character vector of length zero if necessary. It is critical 32 | #' that no entries of `nodes` are duplicated in this output, so we 33 | #' recommend calling `unique()` if there is any potential for repeats 34 | #' in your checking code. 35 | #' 36 | #' @export 37 | check <- function(graph, nodes) { 38 | UseMethod("check") 39 | } 40 | 41 | #' Get the in-degree and out-degree of nodes in an abstract graph 42 | #' 43 | #' This function is only called on nodes that have been [check()]'d. It is 44 | #' safe to assume that `nodes` is non-empty. 45 | #' 46 | #' @param graph A graph object. 47 | #' @param nodes The name(s) of node(s) in `graph` as a character vector. 48 | #' Methods may assume that there are no repeated values in `nodes`. 49 | #' 50 | #' @return A [data.frame()] with one row for every node in `nodes` and 51 | #' two columns: `in_degree` and `out_degree`. In a symmetric graph, 52 | #' `in_degree` and `out_degree` should match. 53 | #' 54 | #' @export 55 | node_degrees <- function(graph, nodes) { 56 | UseMethod("node_degrees") 57 | } 58 | 59 | #' Get the neighborhood of a node in a graph 60 | #' 61 | #' That is, find all nodes connected to `node` by an outgoing edge. 62 | #' This function is memoised to avoid making repeated API queries. 63 | #' 64 | #' @param graph A graph object. 65 | #' @param node The name of a single node in `graph` as a character vector. 
66 | #' 67 | #' @return A character vector of all nodes in `graph` connected such that 68 | #' there is an outgoing edge from `node` to those nodes. This should 69 | #' never be empty, as `neighborhood()` should not be called on nodes 70 | #' that fail `check()`, and `check()` enforces that nodes have out-degree 71 | #' of at least one. It is critical that no node names are duplicated in the 72 | #' output; we recommend calling `unique()` if there is any potential 73 | #' for that to occur. 74 | #' 75 | #' @export 76 | neighborhood <- function(graph, node) { 77 | 78 | if (length(node) != 1) 79 | stop("`node` must be a character vector of length 1L.", call. = FALSE) 80 | 81 | UseMethod("neighborhood") 82 | } 83 | 84 | # memoized versions, these are what actually get used 85 | #' @importFrom memoise memoise 86 | memo_neighborhood <- memoise::memoise(neighborhood) 87 | 88 | #' @method print abstract_graph 89 | #' @export 90 | print.abstract_graph <- function(x, ...) { 91 | cat(glue("Abstract graph object (subclass: {class(x)[1]})\n")) 92 | } 93 | -------------------------------------------------------------------------------- /R/graph-igraph.R: -------------------------------------------------------------------------------- 1 | 2 | #' @rdname appr 3 | #' @export 4 | appr.igraph <- function(graph, seeds, ...) { 5 | 6 | if (!requireNamespace("igraph", quietly = TRUE)) 7 | stop("`igraph` package must be installed to use igraphs.", call. = FALSE) 8 | 9 | if (is.null(igraph::V(graph)$name)) 10 | igraph::V(graph)$name <- as.character(1:igraph::gorder(graph)) 11 | 12 | appr.abstract_graph(graph = graph, seeds = seeds, ...) 
13 | } 14 | 15 | check.igraph <- function(graph, nodes) { 16 | 17 | node_names <- names(igraph::V(graph)) 18 | nodes_in_graph <- nodes[nodes %in% node_names] 19 | 20 | nodes_in_graph[igraph::degree(graph, v = nodes_in_graph, mode = "out") > 0] 21 | } 22 | 23 | node_degrees.igraph <- function(graph, nodes) { 24 | list( 25 | in_degree = igraph::degree(graph, v = nodes, mode = "in"), 26 | out_degree = igraph::degree(graph, v = nodes, mode = "out") 27 | ) 28 | } 29 | 30 | # character list of neighboring nodes 31 | # treat directed vs undirected differently? 32 | neighborhood.igraph <- function(graph, node) { 33 | int_node_list <- igraph::ego( 34 | graph, nodes = node, mode = "out", mindist = 1 35 | ) 36 | 37 | nodes <- int_node_list[[1]] 38 | igraph::V(graph)$name[nodes] 39 | } 40 | 41 | -------------------------------------------------------------------------------- /R/graph-rtweet.R: -------------------------------------------------------------------------------- 1 | #' Create an abstract representation of the Twitter friendship graph 2 | #' 3 | #' Signifies that `aPPR` should query the Twitter friendship graph via 4 | #' `rtweet`. 5 | #' 6 | #' @inheritParams rtweet::get_friends 7 | #' 8 | #' @export 9 | rtweet_graph <- function(retryonratelimit = TRUE, verbose = TRUE, n = 5000) { 10 | 11 | if (!requireNamespace("rtweet", quietly = TRUE)) { 12 | stop( 13 | "`rtweet` package must be installed to use `rtweet_graph()`", 14 | call. = FALSE 15 | ) 16 | } 17 | 18 | agraph <- abstract_graph( 19 | "rtweet_graph", 20 | retryonratelimit = retryonratelimit, 21 | verbose = verbose, 22 | max_friends = n 23 | ) 24 | 25 | agraph 26 | } 27 | 28 | #' @rdname appr 29 | #' @export 30 | appr.rtweet_graph <- function(graph, seeds, ...) 
{ 31 | 32 | seed_data <- rtweet::lookup_users( 33 | seeds, 34 | retryonratelimit = graph$retryonratelimit, 35 | verbose = graph$verbose 36 | ) 37 | 38 | if (any(seed_data$protected)) { 39 | stop("Seed nodes should not be protected Twitter accounts.", call. = FALSE) 40 | } 41 | 42 | # convert seeds, potentially passed as screen names, to user ids 43 | seeds <- seed_data$id_str 44 | 45 | NextMethod() 46 | } 47 | 48 | # return character vector of all good nodes in the batch 49 | #' @importFrom glue glue 50 | check.rtweet_graph <- function(graph, nodes) { 51 | 52 | logger::log_debug(glue("Checking nodes")) 53 | logger::log_trace(glue("Checking nodes: {nodes}")) 54 | 55 | if (length(nodes) < 1) 56 | return(character(0)) 57 | 58 | node_data <- rtweet::lookup_users( 59 | nodes, 60 | retryonratelimit = graph$retryonratelimit, 61 | verbose = graph$verbose 62 | ) 63 | 64 | if (is.null(node_data) || nrow(node_data) < 1) 65 | return(character(0)) 66 | 67 | good_nodes <- !node_data$protected & node_data$friends_count > 0 68 | 69 | logger::log_debug(glue("Done checking nodes")) 70 | 71 | node_data$id_str[good_nodes] 72 | } 73 | 74 | node_degrees.rtweet_graph <- function(graph, nodes) { 75 | 76 | logger::log_debug(glue("Getting node degrees")) 77 | logger::log_trace(glue("Getting node degrees for node: {nodes}")) 78 | 79 | # assumes that you want any errors / empty rows when accessing this 80 | # data, i.e. 
that the nodes have already been checked 81 | 82 | node_data <- rtweet::lookup_users( 83 | nodes, 84 | retryonratelimit = graph$retryonratelimit, 85 | verbose = graph$verbose 86 | ) 87 | 88 | logger::log_debug(glue("Done getting node degrees")) 89 | 90 | list( 91 | in_degree = node_data$followers_count, 92 | out_degree = node_data$friends_count 93 | ) 94 | } 95 | 96 | neighborhood.rtweet_graph <- function(graph, node) { 97 | 98 | logger::log_debug(glue("Getting neighborhood: {node}")) 99 | 100 | # if a user doesn't follow anyone, safe_get_friends returns an empty 101 | # tibble, but instead it should return an empty character vector? 102 | friends <- rtweet::get_friends( 103 | users = node, 104 | n = graph$max_friends, 105 | retryonratelimit = graph$retryonratelimit, 106 | verbose = graph$verbose 107 | ) 108 | 109 | logger::log_debug(glue("Done getting neighborhood")) 110 | 111 | if (nrow(friends) < 1) character(0) else friends$to_id 112 | } 113 | 114 | -------------------------------------------------------------------------------- /R/tracker.R: -------------------------------------------------------------------------------- 1 | #' R6 class to manage personalized pagerank calculations 2 | #' 3 | #' @importFrom R6 R6Class 4 | #' 5 | Tracker <- R6Class("Tracker", list( 6 | 7 | #' @field seeds A character vector of the seed nodes. 8 | seeds = character(0), 9 | 10 | #' @field path A character vector of nodes whose neighborhoods we 11 | #' examined. 12 | path = character(0), 13 | 14 | #' @field stats A [tibble::tibble()] with one row for each visited 15 | #' node and the following columns: 16 | #' 17 | #' - `name`: Name of a node (character). 18 | #' - `r`: Current estimate of residual per out-degree for a node. 19 | #' - `p`: Current estimate of the pagerank for a node. 20 | #' - `in_degree`: Number of incoming edges to a node. 21 | #' - `out_degree`: Number of outcoming edges from a node. 
22 | #' 23 | stats = NULL, 24 | 25 | #' @field failed A character vector of nodes that could not be visited. 26 | failed = character(0), 27 | 28 | #' @field graph An abstract graph object. 29 | graph = NULL, 30 | 31 | #' @field alpha Teleportation constant from Algorithm 3. 32 | alpha = numeric(0), 33 | 34 | #' @field alpha_prime Transformed teleportation constant from Algorithm 3. 35 | alpha_prime = numeric(0), 36 | 37 | #' @field epsilon Error tolerance. 38 | epsilon = numeric(0), 39 | 40 | #' @field max_visits Maximum number of nodes to visit before terminating. 41 | max_visits = integer(0), 42 | 43 | #' @field tau Regularization parameter used in Algorithm 4. 44 | tau = numeric(0), 45 | 46 | #' @description 47 | #' 48 | #' Create a new Tracker object. 49 | #' 50 | #' @param graph See [appr()]. 51 | #' @param alpha See [appr()]. 52 | #' @param epsilon See [appr()]. 53 | #' @param tau See [appr()]. 54 | #' @param max_visits See [appr()]. 55 | #' 56 | #' @return A new `Tracker` object. 57 | #' 58 | #' @importFrom tibble tibble 59 | #' 60 | initialize = function(graph, alpha, epsilon, tau, max_visits) { 61 | 62 | self$graph <- graph 63 | self$alpha <- alpha 64 | self$alpha_prime <- alpha / (2 - alpha) 65 | self$epsilon <- epsilon 66 | self$tau <- tau 67 | self$max_visits <- max_visits 68 | 69 | self$stats <- tibble::tibble( 70 | name = character(0), 71 | regularized = numeric(0), 72 | p = numeric(0), 73 | in_degree = numeric(0), 74 | out_degree = numeric(0), 75 | degree_adjusted = numeric(0), 76 | r = numeric(0) 77 | ) 78 | }, 79 | 80 | #' @description 81 | #' 82 | #' Print the tibble containing the current state of the pagerank 83 | #' calculation. 
84 | #' 85 | print = function() { 86 | 87 | cat("Personalized PageRank Approximator\n") 88 | cat("----------------------------------\n\n") 89 | 90 | cat(glue(" - number of seeds: {length(self$seeds)}\n", .trim = FALSE)) 91 | cat(glue(" - unique nodes visited so far: {length(unique(self$path))} out of maximum of {self$max_visits}\n", .trim = FALSE)) 92 | cat(glue(" - total visits so far: {length(self$path)}\n", .trim = FALSE)) 93 | cat(glue(" - bad nodes so far: {length(self$failed)}\n\n", .trim = FALSE)) 94 | 95 | cat(glue(" - teleportation constant (alpha): {self$alpha}\n", .trim = FALSE)) 96 | cat(glue(" - desired approximation error (epsilon): {self$epsilon}\n", .trim = FALSE)) 97 | cat(glue(" - achieved bound on approximation error: {self$current_approximation_error()}\n", .trim = FALSE)) 98 | cat(glue(" - length of to visit list: {length(self$remaining())}\n\n", .trim = FALSE)) 99 | 100 | cat(glue("PPR table (see $stats field):\n\n")) 101 | 102 | print(self$stats) 103 | invisible(self) 104 | }, 105 | 106 | #' @description 107 | #' 108 | #' Determine nodes that need to be visited. Note that, 109 | #' if there is a node with zero out degree, you will never 110 | #' leave from that node. So it is important to make sure 111 | #' we never add nodes with zero out degree into the tracker. 112 | #' 113 | #' @return A character vector of node names with current residuals 114 | #' greater than `epsilon`. 115 | #' 116 | remaining = function() { 117 | 118 | # when we initialize, we need to initialize to the seeds 119 | # here we check for initialization by consider the path 120 | # of nodes we've visited so far. it's very important that 121 | # we do not populate `path` when adding the seeds 122 | if (length(self$path) < 1) 123 | return(self$seeds) 124 | 125 | self$stats[self$stats$r > self$epsilon * self$stats$out_degree, ]$name 126 | }, 127 | 128 | #' @description 129 | #' 130 | #' Determine current quality of approximation. 
131 | #' 132 | #' @return A numeric vector of length one with the current worst 133 | #' error bound. 134 | #' 135 | current_approximation_error = function() { 136 | 137 | nodewise_approx_error <- self$stats$r / self$stats$out_degree 138 | max(nodewise_approx_error) 139 | }, 140 | 141 | #' @description 142 | #' 143 | #' Check if there is already a row for a particular node 144 | #' 145 | #' @param nodes Character name of node(s) in the graph. 146 | #' 147 | #' @return `TRUE` if there is a row for `node`, `FALSE` if there 148 | #' is not a row for `node`. 149 | #' 150 | in_tracker = function(nodes) { 151 | nodes %in% self$stats$name 152 | }, 153 | 154 | #' @description 155 | #' 156 | #' Check if we previously failed to visit a node 157 | #' 158 | #' @param node Character name of a node in the graph. 159 | #' 160 | #' @return `TRUE` if we failed to visit `node`, `FALSE` otherwise. 161 | #' Note that this function will return `FALSE` if `node` is new 162 | #' and we haven't seen it before. 163 | #' 164 | in_failed = function(node) { 165 | node %in% self$failed 166 | }, 167 | 168 | #' @description 169 | #' 170 | #' Create an entry for `node` in the tracker. Assumes that 171 | #' `node` is not in the tracker yet, and does not check if 172 | #' this is the case. 173 | #' 174 | #' @param seeds The name of the node in the graph as a length 1 175 | #' character vector. 176 | #' 177 | #' @param preference TODO: recall what on earth this is. 178 | #' 179 | add_seed = function(seeds, preference) { 180 | self$seeds <- c(self$seeds, seeds) 181 | self$add_nodes(nodes = seeds, preference = preference) 182 | }, 183 | 184 | #' @description 185 | #' 186 | #' TODO 187 | #' 188 | #' @param node The name of the node in the graph as a length 1 189 | #' character vector. 190 | #' 191 | add_to_path = function(node) { 192 | self$path <- c(self$path, node) 193 | }, 194 | 195 | #' @description 196 | #' 197 | #' Create an entry for `node` in the tracker. 
Assumes that 198 | #' `node` is not in the tracker yet, and does not check if 199 | #' this is the case. 200 | #' 201 | #' @param nodes The name(s) of node(s) in the graph as a character vector. 202 | #' 203 | #' @param preference TODO: recall what on earth this is. 204 | #' 205 | add_nodes = function(nodes, preference = 0) { 206 | 207 | log_trace(glue("Adding node(s) to tracker: {nodes}")) 208 | 209 | degree <- node_degrees(self$graph, nodes) 210 | 211 | self$stats <- tibble::add_row( 212 | self$stats, 213 | name = nodes, 214 | regularized = NA_real_, 215 | p = 0, 216 | in_degree = degree$in_degree, 217 | out_degree = degree$out_degree, 218 | degree_adjusted = NA_real_, 219 | r = preference 220 | ) 221 | 222 | }, 223 | 224 | #' @description 225 | #' 226 | #' Add `node` to the list of nodes we failed to visit. 227 | #' Assumes that `node` is not in the failed list yet, and 228 | #' does not check if this is the case. 229 | #' 230 | #' @param nodes The name of the node in the graph as a length 1 231 | #' character vector. 232 | #' 233 | add_failed = function(nodes) { 234 | self$failed <- c(self$failed, nodes) 235 | }, 236 | 237 | #' @description 238 | #' 239 | #' Update the estimate of the personalized pagerank for a given node 240 | #' 241 | #' @param node Character name of a node in the graph. 242 | #' 243 | update_p = function(node) { 244 | 245 | node_index <- which(self$stats$name == node) 246 | self$stats[[node_index, "p"]] <- self$stats[[node_index, "p"]] + 247 | self$alpha_prime * self$stats[[node_index, "r"]] 248 | }, 249 | 250 | #' @description 251 | #' 252 | #' Update the residual of a *good* node in the neighborhood of 253 | #' the current node, adding it to the tracker if necessary 254 | #' 255 | #' @param u Character name of the node we are currently visiting. 256 | #' @param v Names of neighbors of `u` as a character vector. Can 257 | #' contain multiple elements. Can also contain zero elements. 
258 |   #'
259 |   update_r_neighbor = function(u, v) {
260 | 
261 |     log_trace(glue("update_r_neighbor({u}, {v})"))
262 | 
263 |     stopifnot(length(u) == 1)
264 | 
265 |     if (length(v) < 1)
266 |       return(invisible(NULL))
267 | 
268 |     new_nodes <- v[!self$in_tracker(v)]
269 | 
270 |     if (length(new_nodes) > 0)
271 |       self$add_nodes(new_nodes)
272 | 
273 |     u_index <- which(self$stats$name == u)
274 |     v_index <- match(v, self$stats$name)
275 | 
276 |     self$stats[v_index, "r"] <- self$stats[v_index, "r"] +
277 |       (1 - self$alpha_prime) * self$stats[[u_index, "r"]] /
278 |       (2 * self$stats[[u_index, "out_degree"]])
279 | 
280 |   },
281 | 
282 |   #' @description
283 |   #'
284 |   #' Update the residual of current node
285 |   #'
286 |   #' @param node Character name of the node we are currently visiting.
287 |   #'
288 |   update_r_self = function(node) {
289 |     node_index <- which(self$stats$name == node)
290 |     self$stats[[node_index, "r"]] <- (1 - self$alpha_prime) *
291 |       self$stats[[node_index, "r"]] / 2
292 |   },
293 | 
294 |   #' @description
295 |   #'
296 |   #' Compute the degree-adjusted and regularized variants of personalized
297 |   #' PageRank as in Algorithm 4, based on the outputs of Algorithm 3.
298 |   #'
299 |   #' Takes no arguments. Uses `self$tau` when it was supplied, and the
300 |   #' mean in-degree of visited nodes otherwise.
301 |   regularize = function() {
302 | 
303 |     # use the user-supplied tau if given, the mean in-degree otherwise
304 |     # (previously `tau` was left unbound whenever `self$tau` was non-NULL)
305 |     tau <- if (is.null(self$tau)) mean(self$stats$in_degree) else self$tau
306 | 
307 |     # might divide by 0 here
308 |     self$stats$degree_adjusted <- self$stats$p / self$stats$in_degree
309 |     self$stats$regularized <- self$stats$p / (self$stats$in_degree + tau)
310 |   },
311 | 
312 |   #' @description
313 |   #'
314 |   #' Main driver function to perform the computations outlined in
315 |   #' Algorithm 3.
316 |   #'
317 |   #' Takes no arguments; loops until no nodes remain or `max_visits` is hit.
318 | #' 319 | calculate_ppr = function() { 320 | 321 | log_info("Approximating PPR ...") 322 | 323 | remaining <- self$remaining() 324 | unique_visits_so_far <- length(unique(self$path)) 325 | 326 | log_info(glue( 327 | "Visits: {length(self$path)} total / ", 328 | "{unique_visits_so_far} unique (max {self$max_visits}) / ", 329 | "{length(remaining)} to visit / ", 330 | "current epsilon: {self$current_approximation_error()}.", 331 | .trim = FALSE 332 | )) 333 | 334 | while (length(remaining) > 0) { 335 | 336 | if (unique_visits_so_far >= self$max_visits) { 337 | warning("Maximum visits reached. Finishing aPPR calculation early.", call. = FALSE) 338 | break 339 | } 340 | 341 | u <- if (length(remaining) == 1) remaining else sample(remaining, size = 1) 342 | 343 | log_trace(glue("Visting {u}")) 344 | 345 | self$update_p(u) 346 | 347 | # here we come into contact with reality and must depart from the 348 | # warm embrace of algorithm 3 349 | 350 | # this is where we learn about new nodes. there are two kinds of new 351 | # nodes: "good" nodes that we can visit, and "bad" nodes that we can't 352 | # visit, such as protected Twitter accounts or nodes that the API fails 353 | # to get for some reason. 
we want to: 354 | # 355 | # - update the good nodes are we typically would 356 | # - pretend the bad nodes don't exist 357 | # 358 | # also note that we only want to *check* each node once 359 | 360 | neighbors <- memo_neighborhood(self$graph, u) 361 | 362 | self$add_to_path(u) 363 | 364 | # first deal with the good neighbors we've already seen all 365 | # at once 366 | 367 | known_good <- neighbors[self$in_tracker(neighbors)] 368 | known_bad <- neighbors[self$in_failed(neighbors)] 369 | 370 | unknown <- setdiff(neighbors, c(known_good, known_bad)) 371 | 372 | new_good <- check(self$graph, unknown) 373 | new_bad <- setdiff(unknown, new_good) 374 | 375 | log_debug( 376 | glue( 377 | "{length(known_good)} known good / ", 378 | "{length(known_bad)} known bad / ", 379 | "{length(new_good)} new good / ", 380 | "{length(new_bad)} new bad", 381 | sep = " " 382 | ) 383 | ) 384 | 385 | log_trace(glue("known good: {known_good}")) 386 | log_trace(glue("known bad: {known_bad}")) 387 | log_trace(glue("new good: {new_good}")) 388 | log_trace(glue("new bad: {new_bad}")) 389 | 390 | self$add_failed(new_bad) 391 | self$update_r_neighbor(u, known_good) 392 | self$update_r_neighbor(u, new_good) 393 | 394 | self$update_r_self(u) 395 | 396 | remaining <- self$remaining() 397 | unique_visits_so_far <- length(unique(self$path)) 398 | 399 | log_info(glue( 400 | "Visits: {length(self$path)} total / ", 401 | "{unique_visits_so_far} unique (max {self$max_visits}) / ", 402 | "{length(remaining)} to visit / ", 403 | "current epsilon: {self$current_approximation_error()}.", 404 | .trim = FALSE 405 | )) 406 | } 407 | 408 | log_info("Approximating PPR ... done") 409 | } 410 | )) 411 | -------------------------------------------------------------------------------- /R/update.R: -------------------------------------------------------------------------------- 1 | #' Update a Tracker object 2 | #' 3 | #' Typically because results are insufficiently precise. 
4 | #'
5 | #' At the moment, only supports changing `epsilon` and `max_visits`. If
6 | #' there is interest, we can consider allowing updates to `tau`, `alpha`
7 | #' and `seeds` in the future.
8 | #'
9 | #' @param object The `Tracker` object to update.
10 | #'
11 | #' @inheritParams appr
12 | #'
13 | #' @return The `Tracker` object with PPR estimates recomputed under the
14 | #' @export
15 | #'
16 | update.Tracker <- function(object, ..., epsilon = object$epsilon, max_visits = object$max_visits) {
17 | 
18 |   object$epsilon <- epsilon
19 |   object$max_visits <- max_visits
20 |   object$calculate_ppr()
21 |   object$regularize()
22 |   object
23 | }
24 | 
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 | 
5 | 
6 | 
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 |   collapse = TRUE,
10 |   comment = "#>",
11 |   fig.path = "man/figures/README-",
12 |   out.width = "100%",
13 |   error = TRUE
14 | )
15 | ```
16 | 
17 | # aPPR
18 | 
19 | 
20 | [![R-CMD-check](https://github.com/RoheLab/aPPR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/RoheLab/aPPR/actions/workflows/R-CMD-check.yaml)
21 | [![Codecov test coverage](https://codecov.io/gh/RoheLab/aPPR/branch/main/graph/badge.svg)](https://app.codecov.io/gh/RoheLab/aPPR?branch=main)
22 | 
23 | 
24 | 
25 | `aPPR` helps you calculate approximate personalized pageranks from large graphs, including those that can only be queried via an API. `aPPR` additionally performs degree correction and regularization, allowing you to recover blocks from stochastic blockmodels.
26 | 
27 | To learn more about `aPPR` you can:
28 | 
29 | 1. Glance through slides from the [JSM2021](https://github.com/alexpghayes/JSM2021) talk
30 | 2. 
Read the accompanying [paper][chen] 31 | 32 | ### Installation 33 | 34 | You can install the development version from [GitHub](https://github.com/) with: 35 | 36 | ``` r 37 | # install.packages("devtools") 38 | devtools::install_github("RoheLab/aPPR") 39 | ``` 40 | 41 | ### Find the personalized pagerank of a node in an `igraph` graph 42 | 43 | ```{r igraph-example, message = FALSE} 44 | library(aPPR) 45 | library(igraph) 46 | 47 | set.seed(27) 48 | 49 | erdos_renyi_graph <- sample_gnp(n = 100, p = 0.5) 50 | 51 | erdos_tracker <- appr( 52 | erdos_renyi_graph, # the graph to work with 53 | seeds = "5", # name of seed node (character) 54 | epsilon = 0.0005 # desired approximation quality (see ?appr) 55 | ) 56 | 57 | erdos_tracker 58 | ``` 59 | 60 | You can access the Personalized PageRanks themselves via the `stats` field of `Tracker` objects. 61 | 62 | ```{r} 63 | erdos_tracker$stats 64 | ``` 65 | 66 | Sometimes you may wish to limit computation time by limiting the number of nodes to visit, which you can do as follows: 67 | 68 | ```{r igraph-example2} 69 | limited_visits_tracker <- appr( 70 | erdos_renyi_graph, 71 | seeds = "5", 72 | epsilon = 1e-10, 73 | max_visits = 20 # max unique nodes to visit during approximation 74 | ) 75 | 76 | limited_visits_tracker 77 | ``` 78 | 79 | ### Find the personalized pagerank of a Twitter user using `rtweet` 80 | 81 | ```{r rtweet-example} 82 | ftrevorc_ppr <- appr( 83 | rtweet_graph(), 84 | "ftrevorc", 85 | epsilon = 1e-4, 86 | max_visits = 5 87 | ) 88 | 89 | ftrevorc_ppr 90 | ``` 91 | 92 | ### Logging 93 | 94 | `aPPR` uses [`logger`](https://daroczig.github.io/logger/) for displaying information to the user. By default, `aPPR` is quite verbose. You can control verbosity by loading `logger` and setting the logging threshold. 
95 | 
96 | ```{r logging-example-1, eval = FALSE}
97 | library(logger)
98 | 
99 | # hide basically all messages (not recommended)
100 | log_threshold(FATAL, namespace = "aPPR")
101 | 
102 | appr(
103 |   erdos_renyi_graph, # the graph to work with
104 |   seeds = "5", # name of seed node (character)
105 |   epsilon = 0.0005 # desired approximation quality (see ?appr)
106 | )
107 | ```
108 | 
109 | If you submit a bug report, please please please include a log file using the TRACE threshold. You can set up this kind of detailed logging via the following:
110 | 
111 | ```{r log-file-example, eval = FALSE}
112 | 
113 | set.seed(528491) # be sure to set seed for bug reports
114 | 
115 | log_appender(
116 |   appender_file(
117 |     "/path/to/logfile.log" ## TODO: choose a path to log to
118 |   ),
119 |   namespace = "aPPR"
120 | )
121 | 
122 | log_threshold(TRACE, namespace = "aPPR")
123 | 
124 | tracker <- appr(
125 |   rtweet_graph(),
126 |   seeds = c("hadleywickham", "gvanrossum"),
127 |   epsilon = 1e-6
128 | )
129 | ```
130 | 
131 | ### Ethical considerations
132 | 
133 | People have a right to choose how public and discoverable their information is. `aPPR` will often lead you to accounts that are interesting, but also small and out of sight. Do not change the public profile of, or direct attention towards, the people running these accounts, or any other accounts, without their permission.
134 | 
135 | ### References
136 | 
137 | 1. Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from Massive Block Model Graphs with Personalized PageRank.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 82, no. 1 (February 2020): 99–126. https://doi.org/10.1111/rssb.12349. [arxiv][chen]
138 | 
139 | 2. Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, USA: IEEE, 2006. https://doi.org/10.1109/FOCS.2006.44. 
140 | 141 | [chen]: https://arxiv.org/abs/1910.12937 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # aPPR 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/RoheLab/aPPR/workflows/R-CMD-check/badge.svg)](https://github.com/RoheLab/aPPR/actions) 9 | [![Codecov test 10 | coverage](https://codecov.io/gh/RoheLab/aPPR/branch/main/graph/badge.svg)](https://app.codecov.io/gh/RoheLab/aPPR?branch=main) 11 | 12 | 13 | `aPPR` helps you calculate approximate personalized pageranks from large 14 | graphs, including those that can only be queried via an API. `aPPR` 15 | additionally performs degree correction and regularization, allowing you 16 | to recover blocks from stochastic blockmodels. 17 | 18 | To learn more about `aPPR` you can: 19 | 20 | 1. Glance through slides from the 21 | [JSM2021](https://github.com/alexpghayes/JSM2021) talk 22 | 2. Read the accompanying [paper](https://arxiv.org/abs/1910.12937) 23 | 24 | ### Installation 25 | 26 | You can install the development version from 27 | [GitHub](https://github.com/) with: 28 | 29 | ``` r 30 | # install.packages("devtools") 31 | devtools::install_github("RoheLab/aPPR") 32 | ``` 33 | 34 | ### Find the personalized pagerank of a node in an `igraph` graph 35 | 36 | ``` r 37 | library(aPPR) 38 | library(igraph) 39 | 40 | set.seed(27) 41 | 42 | erdos_renyi_graph <- sample_gnp(n = 100, p = 0.5) 43 | 44 | erdos_tracker <- appr( 45 | erdos_renyi_graph, # the graph to work with 46 | seeds = "5", # name of seed node (character) 47 | epsilon = 0.0005 # desired approximation quality (see ?appr) 48 | ) 49 | 50 | erdos_tracker 51 | #> Personalized PageRank Approximator 52 | #> ---------------------------------- 53 | #> 54 | #> - number of seeds: 1 55 | #> - visits so far: 5 56 | #> - unique nodes visited so far: 1 out of maximum of Inf 57 | #> - bad nodes so far: 0 58 | #> 59 | #> - 
teleportation constant (alpha): 0.15 60 | #> - desired approximation error (epsilon): 5e-04 61 | #> - achieved bound on approximation error: 0.000416297883029663 62 | #> - current length of to-visit list: 0 63 | #> 64 | #> PPR table (see $stats field): 65 | #> # A tibble: 51 × 7 66 | #> name r p in_degree out_degree degree_adjusted regularized 67 | #> 68 | #> 1 5 0.0205 0.147 50 50 0.00294 0.00147 69 | #> 2 3 0.0167 0 51 51 0 0 70 | #> 3 6 0.0167 0 59 59 0 0 71 | #> 4 8 0.0167 0 41 41 0 0 72 | #> 5 15 0.0167 0 46 46 0 0 73 | #> 6 16 0.0167 0 52 52 0 0 74 | #> 7 17 0.0167 0 48 48 0 0 75 | #> 8 19 0.0167 0 54 54 0 0 76 | #> 9 20 0.0167 0 51 51 0 0 77 | #> 10 21 0.0167 0 55 55 0 0 78 | #> # … with 41 more rows 79 | ``` 80 | 81 | You can access the Personalized PageRanks themselves via the `stats` 82 | field of `Tracker` objects. 83 | 84 | ``` r 85 | erdos_tracker$stats 86 | #> # A tibble: 51 × 7 87 | #> name r p in_degree out_degree degree_adjusted regularized 88 | #> 89 | #> 1 5 0.0205 0.147 50 50 0.00294 0.00147 90 | #> 2 3 0.0167 0 51 51 0 0 91 | #> 3 6 0.0167 0 59 59 0 0 92 | #> 4 8 0.0167 0 41 41 0 0 93 | #> 5 15 0.0167 0 46 46 0 0 94 | #> 6 16 0.0167 0 52 52 0 0 95 | #> 7 17 0.0167 0 48 48 0 0 96 | #> 8 19 0.0167 0 54 54 0 0 97 | #> 9 20 0.0167 0 51 51 0 0 98 | #> 10 21 0.0167 0 55 55 0 0 99 | #> # … with 41 more rows 100 | ``` 101 | 102 | Sometimes you may wish to limit computation time by limiting the number 103 | of nodes to visit, which you can do as follows: 104 | 105 | ``` r 106 | limited_visits_tracker <- appr( 107 | erdos_renyi_graph, 108 | seeds = "5", 109 | epsilon = 1e-10, 110 | max_visits = 20 # max unique nodes to visit during approximation 111 | ) 112 | #> Warning: Maximum visits reached. Finishing aPPR calculation early. 
113 | limited_visits_tracker 114 | #> Personalized PageRank Approximator 115 | #> ---------------------------------- 116 | #> 117 | #> - number of seeds: 1 118 | #> - visits so far: 22 119 | #> - unique nodes visited so far: 20 out of maximum of 20 120 | #> - bad nodes so far: 0 121 | #> 122 | #> - teleportation constant (alpha): 0.15 123 | #> - desired approximation error (epsilon): 1e-10 124 | #> - achieved bound on approximation error: 0.00423832387327568 125 | #> - current length of to-visit list: 100 126 | #> 127 | #> PPR table (see $stats field): 128 | #> # A tibble: 100 × 7 129 | #> name r p in_degree out_degree degree_adjusted regularized 130 | #> 131 | #> 1 5 0.212 0.118 50 50 0.00237 0.00119 132 | #> 2 3 0.0140 0 51 51 0 0 133 | #> 3 6 0.0140 0 59 59 0 0 134 | #> 4 8 0.0140 0 41 41 0 0 135 | #> 5 15 0.0136 0 46 46 0 0 136 | #> 6 16 0.0138 0 52 52 0 0 137 | #> 7 17 0.0138 0 48 48 0 0 138 | #> 8 19 0.0137 0 54 54 0 0 139 | #> 9 20 0.0135 0 51 51 0 0 140 | #> 10 21 0.0138 0 55 55 0 0 141 | #> # … with 90 more rows 142 | ``` 143 | 144 | ### Find the personalized pagerank of a Twitter user using `rtweet` 145 | 146 | ``` r 147 | ftrevorc_ppr <- appr( 148 | rtweet_graph(), 149 | "ftrevorc", 150 | epsilon = 1e-4, 151 | max_visits = 5 152 | ) 153 | #> Warning: Maximum visits reached. Finishing aPPR calculation early. 
154 | ftrevorc_ppr 155 | #> Personalized PageRank Approximator 156 | #> ---------------------------------- 157 | #> 158 | #> - number of seeds: 1 159 | #> - visits so far: 6 160 | #> - unique nodes visited so far: 5 out of maximum of 5 161 | #> - bad nodes so far: 10 162 | #> 163 | #> - teleportation constant (alpha): 0.15 164 | #> - desired approximation error (epsilon): 1e-04 165 | #> - achieved bound on approximation error: 0.00175980395529336 166 | #> - current length of to-visit list: 5 167 | #> 168 | #> PPR table (see $stats field): 169 | #> # A tibble: 210 × 7 170 | #> name r p in_degree out_degree degree_adjusted regularized 171 | #> 172 | #> 1 7752257741314… 0.211 0.118 69 120 0.00172 5.50e-8 173 | #> 2 17163639 0.00559 0 20033 1596 0 0 174 | #> 3 9381208958721… 0.00559 0 372 179 0 0 175 | #> 4 1359003756063… 0.00559 0 230 116 0 0 176 | #> 5 76228303 0.00559 0 7253 2274 0 0 177 | #> 6 1024298722828… 0.00559 0 382 829 0 0 178 | #> 7 1264590946144… 0.00559 0 116 189 0 0 179 | #> 8 1107711818997… 0.00559 0 3404 410 0 0 180 | #> 9 1217315090 0.00559 0 20660 402 0 0 181 | #> 10 1120701503763… 0.00559 0 354 243 0 0 182 | #> # … with 200 more rows 183 | ``` 184 | 185 | ### Logging 186 | 187 | `aPPR` uses [`logger`](https://daroczig.github.io/logger/) for 188 | displaying information to the user. By default, `aPPR` is quite verbose. 189 | You can control verbosity by loading `logger` and setting the logging 190 | threshold. 191 | 192 | ``` r 193 | library(logger) 194 | 195 | # hide basically all messages (not recommended) 196 | log_threshold(FATAL, namespace = "aPPR") 197 | 198 | appr( 199 | erdos_renyi_graph, # the graph to work with 200 | seeds = "5", # name of seed node (character) 201 | epsilon = 0.0005 # desired approximation quality (see ?appr) 202 | ) 203 | ``` 204 | 205 | If you submit a bug report, please please please include a log file 206 | using the TRACE threshold. 
You can set up this kind of detailed logging 207 | via the following: 208 | 209 | ``` r 210 | set.seed(528491) # be sure to set seed for bug reports 211 | 212 | log_appender( 213 | appender_file( 214 | "/path/to/logfile.log" ## TODO: choose a path to log to 215 | ), 216 | namespace = "aPPR" 217 | ) 218 | 219 | log_threshold(TRACE, namespace = "aPPR") 220 | 221 | tracker <- appr( 222 | rtweet_graph(), 223 | seed = c("hadleywickham", "gvanrossum"), 224 | epsilon = 1e-6 225 | ) 226 | ``` 227 | 228 | ### Ethical considerations 229 | 230 | People have a right to choose how public and discoverable their 231 | information is. `aPPR` will often lead you to accounts that interesting, 232 | but also small and out of sight. Do not change the public profile or 233 | attention towards these the people running these accounts, or any other 234 | accounts, without their permission. 235 | 236 | ### References 237 | 238 | 1. Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from 239 | Massive Block Model Graphs with Personalized PageRank.” Journal of 240 | the Royal Statistical Society: Series B (Statistical Methodology) 241 | 82, no. 1 (February 2020): 99–126. 242 | . 243 | [arxiv](https://arxiv.org/abs/1910.12937) 244 | 245 | 2. Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning 246 | Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on 247 | Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, 248 | USA: IEEE, 2006. . 
249 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | Alex Hayes: 3 | href: https://alexpghayes.com 4 | 5 | development: 6 | mode: auto 7 | 8 | template: 9 | bootstrap: 5 10 | params: 11 | bootswatch: flatly 12 | 13 | 14 | reference: 15 | - title: "Define and interact with graphs" 16 | contents: 17 | - abstract_graph 18 | - node_degrees 19 | - check 20 | - neighborhood 21 | - title: "Compute general aPPR results" 22 | contents: 23 | - appr 24 | - contains("Tracker") 25 | - title: "Compute Personalized PageRanks of Twitter users" 26 | contents: 27 | - contains("rtweet") 28 | -------------------------------------------------------------------------------- /aPPR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 
| 0 errors | 0 warnings | 1 note 4 | 5 | New submission 6 | 7 | Version contains large components (0.0.0.9102) 8 | 9 | Possibly mis-spelled words in DESCRIPTION: 10 | PageRank (2:33, 11:66) 11 | 12 | Unknown, possibly mis-spelled, fields in DESCRIPTION: 13 | ‘Remotes’ 14 | 15 | Package has a VignetteBuilder field but no prebuilt vignette index. 16 | 17 | * This is a new release. 18 | -------------------------------------------------------------------------------- /man/Tracker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tracker.R 3 | \name{Tracker} 4 | \alias{Tracker} 5 | \title{R6 class to manage personalized pagerank calculations} 6 | \description{ 7 | R6 class to manage personalized pagerank calculations 8 | 9 | R6 class to manage personalized pagerank calculations 10 | } 11 | \section{Public fields}{ 12 | \if{html}{\out{
}} 13 | \describe{ 14 | \item{\code{seeds}}{A character vector of the seed nodes.} 15 | 16 | \item{\code{path}}{A character vector of nodes whose neighborhoods we 17 | examined.} 18 | 19 | \item{\code{stats}}{A \code{\link[tibble:tibble]{tibble::tibble()}} with one row for each visited 20 | node and the following columns: 21 | \itemize{ 22 | \item \code{name}: Name of a node (character). 23 | \item \code{r}: Current estimate of residual per out-degree for a node. 24 | \item \code{p}: Current estimate of the pagerank for a node. 25 | \item \code{in_degree}: Number of incoming edges to a node. 26 | \item \code{out_degree}: Number of outcoming edges from a node. 27 | }} 28 | 29 | \item{\code{failed}}{A character vector of nodes that could not be visited.} 30 | 31 | \item{\code{graph}}{An abstract graph object.} 32 | 33 | \item{\code{alpha}}{Teleportation constant from Algorithm 3.} 34 | 35 | \item{\code{alpha_prime}}{Transformed teleportation constant from Algorithm 3.} 36 | 37 | \item{\code{epsilon}}{Error tolerance.} 38 | 39 | \item{\code{max_visits}}{Maximum number of nodes to visit before terminating.} 40 | 41 | \item{\code{tau}}{Regularization parameter used in Algorithm 4.} 42 | } 43 | \if{html}{\out{
}} 44 | } 45 | \section{Methods}{ 46 | \subsection{Public methods}{ 47 | \itemize{ 48 | \item \href{#method-Tracker-new}{\code{Tracker$new()}} 49 | \item \href{#method-Tracker-print}{\code{Tracker$print()}} 50 | \item \href{#method-Tracker-remaining}{\code{Tracker$remaining()}} 51 | \item \href{#method-Tracker-current_approximation_error}{\code{Tracker$current_approximation_error()}} 52 | \item \href{#method-Tracker-in_tracker}{\code{Tracker$in_tracker()}} 53 | \item \href{#method-Tracker-in_failed}{\code{Tracker$in_failed()}} 54 | \item \href{#method-Tracker-add_seed}{\code{Tracker$add_seed()}} 55 | \item \href{#method-Tracker-add_to_path}{\code{Tracker$add_to_path()}} 56 | \item \href{#method-Tracker-add_nodes}{\code{Tracker$add_nodes()}} 57 | \item \href{#method-Tracker-add_failed}{\code{Tracker$add_failed()}} 58 | \item \href{#method-Tracker-update_p}{\code{Tracker$update_p()}} 59 | \item \href{#method-Tracker-update_r_neighbor}{\code{Tracker$update_r_neighbor()}} 60 | \item \href{#method-Tracker-update_r_self}{\code{Tracker$update_r_self()}} 61 | \item \href{#method-Tracker-regularize}{\code{Tracker$regularize()}} 62 | \item \href{#method-Tracker-calculate_ppr}{\code{Tracker$calculate_ppr()}} 63 | \item \href{#method-Tracker-clone}{\code{Tracker$clone()}} 64 | } 65 | } 66 | \if{html}{\out{
}} 67 | \if{html}{\out{}} 68 | \if{latex}{\out{\hypertarget{method-Tracker-new}{}}} 69 | \subsection{Method \code{new()}}{ 70 | Create a new Tracker object. 71 | \subsection{Usage}{ 72 | \if{html}{\out{
}}\preformatted{Tracker$new(graph, alpha, epsilon, tau, max_visits)}\if{html}{\out{
}} 73 | } 74 | 75 | \subsection{Arguments}{ 76 | \if{html}{\out{
}} 77 | \describe{ 78 | \item{\code{graph}}{See \code{\link[=appr]{appr()}}.} 79 | 80 | \item{\code{alpha}}{See \code{\link[=appr]{appr()}}.} 81 | 82 | \item{\code{epsilon}}{See \code{\link[=appr]{appr()}}.} 83 | 84 | \item{\code{tau}}{See \code{\link[=appr]{appr()}}.} 85 | 86 | \item{\code{max_visits}}{See \code{\link[=appr]{appr()}}.} 87 | } 88 | \if{html}{\out{
}} 89 | } 90 | \subsection{Returns}{ 91 | A new \code{Tracker} object. 92 | } 93 | } 94 | \if{html}{\out{
}} 95 | \if{html}{\out{}} 96 | \if{latex}{\out{\hypertarget{method-Tracker-print}{}}} 97 | \subsection{Method \code{print()}}{ 98 | Print the tibble containing the current state of the pagerank 99 | calculation. 100 | \subsection{Usage}{ 101 | \if{html}{\out{
}}\preformatted{Tracker$print()}\if{html}{\out{
}} 102 | } 103 | 104 | } 105 | \if{html}{\out{
}} 106 | \if{html}{\out{}} 107 | \if{latex}{\out{\hypertarget{method-Tracker-remaining}{}}} 108 | \subsection{Method \code{remaining()}}{ 109 | Determine nodes that need to be visited. Note that, 110 | if there is a node with zero out degree, you will never 111 | leave from that node. So it is important to make sure 112 | we never add nodes with zero out degree into the tracker. 113 | \subsection{Usage}{ 114 | \if{html}{\out{
}}\preformatted{Tracker$remaining()}\if{html}{\out{
}} 115 | } 116 | 117 | \subsection{Returns}{ 118 | A character vector of node names with current residuals 119 | greater than \code{epsilon}. 120 | } 121 | } 122 | \if{html}{\out{
}} 123 | \if{html}{\out{}} 124 | \if{latex}{\out{\hypertarget{method-Tracker-current_approximation_error}{}}} 125 | \subsection{Method \code{current_approximation_error()}}{ 126 | Determine current quality of approximation. 127 | \subsection{Usage}{ 128 | \if{html}{\out{
}}\preformatted{Tracker$current_approximation_error()}\if{html}{\out{
}} 129 | } 130 | 131 | \subsection{Returns}{ 132 | A numeric vector of length one with the current worst 133 | error bound. 134 | } 135 | } 136 | \if{html}{\out{
}} 137 | \if{html}{\out{}} 138 | \if{latex}{\out{\hypertarget{method-Tracker-in_tracker}{}}} 139 | \subsection{Method \code{in_tracker()}}{ 140 | Check if there is already a row for a particular node 141 | \subsection{Usage}{ 142 | \if{html}{\out{
}}\preformatted{Tracker$in_tracker(nodes)}\if{html}{\out{
}} 143 | } 144 | 145 | \subsection{Arguments}{ 146 | \if{html}{\out{
}} 147 | \describe{ 148 | \item{\code{nodes}}{Character name of node(s) in the graph.} 149 | } 150 | \if{html}{\out{
}} 151 | } 152 | \subsection{Returns}{ 153 | \code{TRUE} if there is a row for \code{node}, \code{FALSE} if there 154 | is not a row for \code{node}. 155 | } 156 | } 157 | \if{html}{\out{
}} 158 | \if{html}{\out{}} 159 | \if{latex}{\out{\hypertarget{method-Tracker-in_failed}{}}} 160 | \subsection{Method \code{in_failed()}}{ 161 | Check if we previously failed to visit a node 162 | \subsection{Usage}{ 163 | \if{html}{\out{
}}\preformatted{Tracker$in_failed(node)}\if{html}{\out{
}} 164 | } 165 | 166 | \subsection{Arguments}{ 167 | \if{html}{\out{
}} 168 | \describe{ 169 | \item{\code{node}}{Character name of a node in the graph.} 170 | } 171 | \if{html}{\out{
}} 172 | } 173 | \subsection{Returns}{ 174 | \code{TRUE} if we failed to visit \code{node}, \code{FALSE} otherwise. 175 | Note that this function will return \code{FALSE} if \code{node} is new 176 | and we haven't seen it before. 177 | } 178 | } 179 | \if{html}{\out{
}} 180 | \if{html}{\out{}} 181 | \if{latex}{\out{\hypertarget{method-Tracker-add_seed}{}}} 182 | \subsection{Method \code{add_seed()}}{ 183 | Create an entry for \code{node} in the tracker. Assumes that 184 | \code{node} is not in the tracker yet, and does not check if 185 | this is the case. 186 | \subsection{Usage}{ 187 | \if{html}{\out{
}}\preformatted{Tracker$add_seed(seeds, preference)}\if{html}{\out{
}} 188 | } 189 | 190 | \subsection{Arguments}{ 191 | \if{html}{\out{
}} 192 | \describe{ 193 | \item{\code{seeds}}{The name of the node in the graph as a length 1 194 | character vector.} 195 | 196 | \item{\code{preference}}{TODO: recall what on earth this is.} 197 | } 198 | \if{html}{\out{
}} 199 | } 200 | } 201 | \if{html}{\out{
}} 202 | \if{html}{\out{}} 203 | \if{latex}{\out{\hypertarget{method-Tracker-add_to_path}{}}} 204 | \subsection{Method \code{add_to_path()}}{ 205 | TODO 206 | \subsection{Usage}{ 207 | \if{html}{\out{
}}\preformatted{Tracker$add_to_path(node)}\if{html}{\out{
}} 208 | } 209 | 210 | \subsection{Arguments}{ 211 | \if{html}{\out{
}} 212 | \describe{ 213 | \item{\code{node}}{The name of the node in the graph as a length 1 214 | character vector.} 215 | } 216 | \if{html}{\out{
}} 217 | } 218 | } 219 | \if{html}{\out{
}} 220 | \if{html}{\out{}} 221 | \if{latex}{\out{\hypertarget{method-Tracker-add_nodes}{}}} 222 | \subsection{Method \code{add_nodes()}}{ 223 | Create an entry for \code{node} in the tracker. Assumes that 224 | \code{node} is not in the tracker yet, and does not check if 225 | this is the case. 226 | \subsection{Usage}{ 227 | \if{html}{\out{
}}\preformatted{Tracker$add_nodes(nodes, preference = 0)}\if{html}{\out{
}} 228 | } 229 | 230 | \subsection{Arguments}{ 231 | \if{html}{\out{
}} 232 | \describe{ 233 | \item{\code{nodes}}{The name(s) of node(s) in the graph as a character vector.} 234 | 235 | \item{\code{preference}}{TODO: recall what on earth this is.} 236 | } 237 | \if{html}{\out{
}} 238 | } 239 | } 240 | \if{html}{\out{
}} 241 | \if{html}{\out{}} 242 | \if{latex}{\out{\hypertarget{method-Tracker-add_failed}{}}} 243 | \subsection{Method \code{add_failed()}}{ 244 | Add \code{node} to the list of nodes we failed to visit. 245 | Assumes that \code{node} is not in the failed list yet, and 246 | does not check if this is the case. 247 | \subsection{Usage}{ 248 | \if{html}{\out{
}}\preformatted{Tracker$add_failed(nodes)}\if{html}{\out{
}} 249 | } 250 | 251 | \subsection{Arguments}{ 252 | \if{html}{\out{
}} 253 | \describe{ 254 | \item{\code{nodes}}{The name of the node in the graph as a length 1 255 | character vector.} 256 | } 257 | \if{html}{\out{
}} 258 | } 259 | } 260 | \if{html}{\out{
}} 261 | \if{html}{\out{}} 262 | \if{latex}{\out{\hypertarget{method-Tracker-update_p}{}}} 263 | \subsection{Method \code{update_p()}}{ 264 | Update the estimate of the personalized pagerank for a given node 265 | \subsection{Usage}{ 266 | \if{html}{\out{
}}\preformatted{Tracker$update_p(node)}\if{html}{\out{
}} 267 | } 268 | 269 | \subsection{Arguments}{ 270 | \if{html}{\out{
}} 271 | \describe{ 272 | \item{\code{node}}{Character name of a node in the graph.} 273 | } 274 | \if{html}{\out{
}} 275 | } 276 | } 277 | \if{html}{\out{
}} 278 | \if{html}{\out{}} 279 | \if{latex}{\out{\hypertarget{method-Tracker-update_r_neighbor}{}}} 280 | \subsection{Method \code{update_r_neighbor()}}{ 281 | Update the residual of a \emph{good} node in the neighborhood of 282 | the current node, adding it to the tracker if necessary 283 | \subsection{Usage}{ 284 | \if{html}{\out{
}}\preformatted{Tracker$update_r_neighbor(u, v)}\if{html}{\out{
}} 285 | } 286 | 287 | \subsection{Arguments}{ 288 | \if{html}{\out{
}} 289 | \describe{ 290 | \item{\code{u}}{Character name of the node we are currently visiting.} 291 | 292 | \item{\code{v}}{Names of neighbors of \code{u} as a character vector. Can 293 | contain multiple elements. Can also contain zero elements.} 294 | } 295 | \if{html}{\out{
}} 296 | } 297 | } 298 | \if{html}{\out{
}} 299 | \if{html}{\out{}} 300 | \if{latex}{\out{\hypertarget{method-Tracker-update_r_self}{}}} 301 | \subsection{Method \code{update_r_self()}}{ 302 | Update the residual of current node 303 | \subsection{Usage}{ 304 | \if{html}{\out{
}}\preformatted{Tracker$update_r_self(node)}\if{html}{\out{
}} 305 | } 306 | 307 | \subsection{Arguments}{ 308 | \if{html}{\out{
}} 309 | \describe{ 310 | \item{\code{node}}{Character name of the node we are currently visiting.} 311 | } 312 | \if{html}{\out{
}} 313 | } 314 | } 315 | \if{html}{\out{
}} 316 | \if{html}{\out{}} 317 | \if{latex}{\out{\hypertarget{method-Tracker-regularize}{}}} 318 | \subsection{Method \code{regularize()}}{ 319 | Compute the degree-adjusted and regularized variants of personalized 320 | PageRank as in Algorithm 4, based on the outputs of Algorithm 3. 321 | \subsection{Usage}{ 322 | \if{html}{\out{
}}\preformatted{Tracker$regularize()}\if{html}{\out{
}} 323 | } 324 | 325 | \subsection{Arguments}{ 326 | \if{html}{\out{
}} 327 | \describe{ 328 | \item{\code{node}}{Character name of the node we are currently visiting.} 329 | } 330 | \if{html}{\out{
}} 331 | } 332 | } 333 | \if{html}{\out{
}} 334 | \if{html}{\out{}} 335 | \if{latex}{\out{\hypertarget{method-Tracker-calculate_ppr}{}}} 336 | \subsection{Method \code{calculate_ppr()}}{ 337 | Main driver function to perform the computations outlined in 338 | Algorithm 3. 339 | \subsection{Usage}{ 340 | \if{html}{\out{
}}\preformatted{Tracker$calculate_ppr()}\if{html}{\out{
}} 341 | } 342 | 343 | \subsection{Arguments}{ 344 | \if{html}{\out{
}} 345 | \describe{ 346 | \item{\code{node}}{Character name of the node we are currently visiting.} 347 | } 348 | \if{html}{\out{
}} 349 | } 350 | } 351 | \if{html}{\out{
}} 352 | \if{html}{\out{}} 353 | \if{latex}{\out{\hypertarget{method-Tracker-clone}{}}} 354 | \subsection{Method \code{clone()}}{ 355 | The objects of this class are cloneable with this method. 356 | \subsection{Usage}{ 357 | \if{html}{\out{
}}\preformatted{Tracker$clone(deep = FALSE)}\if{html}{\out{
}} 358 | } 359 | 360 | \subsection{Arguments}{ 361 | \if{html}{\out{
}} 362 | \describe{ 363 | \item{\code{deep}}{Whether to make a deep clone.} 364 | } 365 | \if{html}{\out{
}} 366 | } 367 | } 368 | } 369 | -------------------------------------------------------------------------------- /man/aPPR-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aPPR-package.R 3 | \docType{package} 4 | \name{aPPR-package} 5 | \alias{aPPR} 6 | \alias{aPPR-package} 7 | \title{aPPR: Approximate Personalized PageRank} 8 | \description{ 9 | Calculates approximate and regularized personalized PageRank vectors for massive graphs, including those that can only be queried via an API. Regularization allows discovery of community structure under some stochastic block models. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/RoheLab/aPPR} 15 | \item Report bugs at \url{https://github.com/RoheLab/aPPR/issues} 16 | } 17 | 18 | } 19 | \author{ 20 | \strong{Maintainer}: Alex Hayes \email{alexpghayes@gmail.com} (\href{https://orcid.org/0000-0002-4985-5160}{ORCID}) [copyright holder] 21 | 22 | Authors: 23 | \itemize{ 24 | \item Fan Chen \email{fan.chen@wisc.edu} (\href{https://orcid.org/0000-0003-4508-6023}{ORCID}) 25 | \item Karl Rohe 26 | } 27 | 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/abstract_graph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{abstract_graph} 4 | \alias{abstract_graph} 5 | \title{Create an abstract graph object} 6 | \usage{ 7 | abstract_graph(subclass, ...) 8 | } 9 | \arguments{ 10 | \item{subclass}{Desired subclass (character).} 11 | 12 | \item{...}{Other arguments to pass to \code{list()}. 
See 13 | \code{\link[=rtweet_graph]{rtweet_graph()}} for an example.} 14 | } 15 | \description{ 16 | Could be an actual graph object, or a graph such as the Twitter 17 | following network defined implicitly via API requests, etc. 18 | The abstract graph is just a list with \code{abstract_graph} class 19 | and your desired subclass. 20 | } 21 | -------------------------------------------------------------------------------- /man/appr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aPPR.R, R/graph-igraph.R, R/graph-rtweet.R 3 | \name{appr} 4 | \alias{appr} 5 | \alias{appr.igraph} 6 | \alias{appr.rtweet_graph} 7 | \title{Approximate personalized pageranks} 8 | \usage{ 9 | appr( 10 | graph, 11 | seeds, 12 | ..., 13 | alpha = 0.15, 14 | epsilon = 1e-06, 15 | tau = NULL, 16 | max_visits = Inf 17 | ) 18 | 19 | \method{appr}{igraph}(graph, seeds, ...) 20 | 21 | \method{appr}{rtweet_graph}(graph, seeds, ...) 22 | } 23 | \arguments{ 24 | \item{graph}{An \code{\link[=abstract_graph]{abstract_graph()}} object, such as that created by 25 | \code{\link[=rtweet_graph]{rtweet_graph()}}. This argument is required.} 26 | 27 | \item{seeds}{A character vector of seeds for the personalized pagerank. 28 | The personalized pagerank will return to each of these seeds with 29 | probability \code{alpha} at each node transition. At the moment, 30 | all seeds are given equal weighting. This argument is required.} 31 | 32 | \item{...}{Ignored. Passing arguments to \code{...} results in a warning.} 33 | 34 | \item{alpha}{Teleportation constant. The teleportation constant is the 35 | probability of returning to a seed node at each node transition. 36 | \code{alpha} must be a valid probabilty; that is, between zero and one. 37 | Defaults to \code{0.15}. 
This is the complement of the "damping factor" 38 | in the original PageRank paper, so \code{alpha = 0.15} corresponds 39 | to a damping factor of \code{0.85}. Runtime is proportional to 40 | \code{1 / (epsilon * alpha)}, so small \code{alpha} can result in long 41 | runtimes.} 42 | 43 | \item{epsilon}{Desired accuracy of approximation. \code{epsilon} must be 44 | a small positive number. Defaults to \code{1e-6}. \code{aPPR} guarantees that 45 | approximated personalized pageranks are uniformly within \code{epsilon} of 46 | their true value. That is, the approximation is guaranteed to be good 47 | in an L-infinity sense. This does not guarantee, however, that 48 | a ranking of nodes by aPPR is close to a ranking of nodes by PPR. 49 | 50 | For Twitter graphs, we recommend testing your code with \code{1e-4} or \code{1e-5}, 51 | using \code{1e-6} for exploration, and \code{1e-7} to \code{1e-8} for final results, 52 | although these numbers are very rough. It is also perfectly reasonable 53 | to run \code{aPPR} for a given number of steps (set via \code{max_visits}), 54 | and then note the approximation accuracy of your results. Internally, 55 | \code{aPPR} keeps a running estimate of achieved accuracy that is always valid. 56 | 57 | Anytime you would like to explore more of the graph, you can simply 58 | decrease \code{epsilon}. So you can start with \code{epsilon = 1e-5} and then 59 | gradually decrease \code{epsilon} until you have a sample of the graph 60 | that you are happy with. 61 | 62 | Also note that runtime is proportional to \code{1 / (epsilon * alpha)}, 63 | so small \code{epsilon} can result in long runtimes.} 64 | 65 | \item{tau}{Regularization term. Additionally inflates the in-degree 66 | of each observation by this term by performing the degree 67 | adjustment described in Algorithm 3 and Algorithm 4, which 68 | are described in \code{vignette("Mathematical details")}. 
Defaults to 69 | \code{NULL}, in which case \code{tau} is set to the average in-degree of 70 | the observed nodes. In general, it's reasonable to 71 | set \code{tau} to the average in-degree of the graph.} 72 | 73 | \item{max_visits}{Maximum number of unique nodes to visit. Should be a 74 | positive integer. Defaults to \code{Inf}, such that there is no upper bound 75 | on the number of unique nodes to visit. Useful when you want to specify a 76 | fixed amount of computation (or API calls) to use rather than an 77 | error tolerance. We recommend debugging with \code{max_visits ~ 20}, 78 | exploration with \code{max_visits} in the hundreds, and \code{max_visits} in the 79 | thousands to tens of thousands for precise results, although this is a 80 | very rough heuristic.} 81 | } 82 | \value{ 83 | A \code{\link[=Tracker]{Tracker()}} object. Most relevant is the \code{stats} field, 84 | a \code{\link[tibble:tibble]{tibble::tibble()}} with the following columns: 85 | \itemize{ 86 | \item \code{name}: Name of a node (character). 87 | \item \code{p}: Current estimate of residual per out-degree for a node. 88 | \item \code{r}: Estimated error of pagerank estimate for a node. 89 | \item \code{in_degree}: Number of incoming edges to a node. 90 | \item \code{out_degree}: Number of outgoing edges from a node. 91 | \item \code{degree_adjusted}: The personalized pagerank divided by the 92 | node in-degree. 93 | \item \code{regularized}: The personalized pagerank divided by the node 94 | in-degree plus \code{tau}. 95 | } 96 | 97 | When computing personalized pageranks for Twitter users (either 98 | via \code{\link[=rtweet_graph]{rtweet_graph()}}), \code{name} is given 99 | as a user ID, not a screen name, regardless of how the seed nodes 100 | were specified. 101 | } 102 | \description{ 103 | Computes the personalized pagerank for specified seeds using the 104 | \code{ApproximatePageRank} algorithm of Andersen et al. (2006). 
Computes 105 | degree-adjustments and degree-regularization of personalized 106 | pagerank vectors as described in Algorithms 3 and 4 of Chen et al. (2019). 107 | These algorithms are randomized; if results are unstable across 108 | multiple runs, decrease \code{epsilon}. 109 | } 110 | \examples{ 111 | 112 | library(aPPR) 113 | library(igraph) 114 | 115 | set.seed(27) 116 | 117 | graph <- rtweet_graph() 118 | 119 | \dontrun{ 120 | appr(graph, "alexpghayes") 121 | } 122 | 123 | graph2 <- sample_pa(100) 124 | 125 | # this creates a Tracker object 126 | ppr_results <- appr(graph2, seeds = "5") 127 | 128 | # the portion of the Tracker object you probably care about 129 | ppr_results$stats 130 | 131 | } 132 | \references{ 133 | \enumerate{ 134 | \item Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from Massive Block Model Graphs with Personalized PageRank.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 82, no. 1 (February 2020): 99–126. https://doi.org/10.1111/rssb.12349. 135 | \item Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, USA: IEEE, 2006. https://doi.org/10.1109/FOCS.2006.44. 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /man/check.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{check} 4 | \alias{check} 5 | \title{Check if a node an abstract graph is acceptable for inclusion in PPR} 6 | \usage{ 7 | check(graph, nodes) 8 | } 9 | \arguments{ 10 | \item{graph}{A graph object.} 11 | 12 | \item{nodes}{The name(s) of node(s) in \code{graph} as a character vector.} 13 | } 14 | \value{ 15 | The subset of \code{nodes} that are acceptable for inclusion. 
This 16 | can be a character vector of length zero if necessary. It is critical 17 | that no entries of \code{nodes} are duplicated in this output, so we 18 | recommend calling \code{unique()} if there is any potential for repeats 19 | in your checking code. 20 | } 21 | \description{ 22 | Inclusion criteria: 23 | } 24 | \details{ 25 | \itemize{ 26 | \item At least one outgoing edge 27 | \item Can get in degree and out degree of node 28 | \item Can get all nodes connected to \code{node} / the 1-hop neighborhood 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/neighborhood.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{neighborhood} 4 | \alias{neighborhood} 5 | \title{Get the neighborhood of a node in a graph} 6 | \usage{ 7 | neighborhood(graph, node) 8 | } 9 | \arguments{ 10 | \item{graph}{A graph object.} 11 | 12 | \item{node}{The name of a single node in \code{graph} as a character vector.} 13 | } 14 | \value{ 15 | A character vector of all nodes in \code{graph} connected such that 16 | there is an outgoing edge from \code{node} to those nodes. This should 17 | never be empty, as \code{neighborhood()} should not be called on nodes 18 | that fail \code{check()}, and \code{check()} enforces that nodes have out-degree 19 | of at least one. It is critical that no node names are duplicated in the 20 | output; we recommend calling \code{unique()} if there is any potential 21 | for that to occur. 22 | } 23 | \description{ 24 | That is, find all nodes connected to \code{node} by an outgoing edge. 25 | This function is memoised to avoid making repeated API queries. 
26 | } 27 | -------------------------------------------------------------------------------- /man/node_degrees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{node_degrees} 4 | \alias{node_degrees} 5 | \title{Get the in-degree and out-degree of nodes in an abstract graph} 6 | \usage{ 7 | node_degrees(graph, nodes) 8 | } 9 | \arguments{ 10 | \item{graph}{A graph object.} 11 | 12 | \item{nodes}{The name(s) of node(s) in \code{graph} as a character vector. 13 | Methods may assume that there are no repeated values in \code{nodes}.} 14 | } 15 | \value{ 16 | A \code{\link[=data.frame]{data.frame()}} with one row for every node in \code{nodes} and 17 | two columns: \code{in_degree} and \code{out_degree}. In a symmetric graph, 18 | \code{in_degree} and \code{out_degree} should match. 19 | } 20 | \description{ 21 | This function is only called on nodes that have been \code{\link[=check]{check()}}'d. It is 22 | safe to assume that \code{nodes} is non-empty. 23 | } 24 | -------------------------------------------------------------------------------- /man/rtweet_graph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/graph-rtweet.R 3 | \name{rtweet_graph} 4 | \alias{rtweet_graph} 5 | \title{Create an abstract representation of the Twitter friendship graph} 6 | \usage{ 7 | rtweet_graph(retryonratelimit = TRUE, verbose = TRUE, n = 5000) 8 | } 9 | \arguments{ 10 | \item{retryonratelimit}{If \code{TRUE}, and a rate limit is exhausted, will wait 11 | until it refreshes. Most Twitter rate limits refresh every 15 minutes. 12 | If \code{FALSE}, and the rate limit is exceeded, the function will terminate 13 | early with a warning; you'll still get back all results received up to 14 | that point. 
The default value, \code{NULL}, consults the option 15 | \code{rtweet.retryonratelimit} so that you can globally set it to \code{TRUE}, 16 | if desired. 17 | 18 | If you expect a query to take hours or days to perform, you should not 19 | rely solely on \code{retryonratelimit} because it does not handle other common 20 | failure modes like temporarily losing your internet connection.} 21 | 22 | \item{verbose}{Show progress bars and other messages indicating current 23 | progress?} 24 | 25 | \item{n}{Desired number of results to return. Results are downloaded 26 | in pages when \code{n} is large; the default value will download a single 27 | page. Set \code{n = Inf} to download as many results as possible. 28 | 29 | The Twitter API rate limits the number of requests you can perform 30 | in each 15 minute period. The easiest way to download more than that is 31 | to use \code{retryonratelimit = TRUE}. 32 | 33 | You are not guaranteed to get exactly \code{n} results back. You will get 34 | fewer results when tweets have been deleted or if you hit a rate limit. 35 | You will get more results if you ask for a number of tweets that's not 36 | a multiple of page size, e.g. if you request \code{n = 150} and the page 37 | size is 200, you'll get 200 results back.} 38 | } 39 | \description{ 40 | Signifies that \code{aPPR} should query the Twitter friendship graph via 41 | \code{rtweet}. 42 | } 43 | -------------------------------------------------------------------------------- /man/update.Tracker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/update.R 3 | \name{update.Tracker} 4 | \alias{update.Tracker} 5 | \title{Update a Tracker object} 6 | \usage{ 7 | \method{update}{Tracker}(object, ..., epsilon, max_visits) 8 | } 9 | \arguments{ 10 | \item{object}{The \code{Tracker} object to update.} 11 | 12 | \item{...}{Ignored. 
Passing arguments to \code{...} results in a warning.} 13 | 14 | \item{epsilon}{Desired accuracy of approximation. \code{epsilon} must be 15 | a small positive number. Defaults to \code{1e-6}. \code{aPPR} guarantees that 16 | approximated personalized pageranks are uniformly within \code{epsilon} of 17 | their true value. That is, the approximation is guaranteed to be good 18 | in an L-infinity sense. This does not guarantee, however, that 19 | a ranking of nodes by aPPR is close to a ranking of nodes by PPR. 20 | 21 | For Twitter graphs, we recommend testing your code with \code{1e-4} or \code{1e-5}, 22 | using \code{1e-6} for exploration, and \code{1e-7} to \code{1e-8} for final results, 23 | although these numbers are very rough. It also perfectly reasonable 24 | to run \code{aPPR} for a given number of steps (set via \code{max_visits}), 25 | and then note the approximation accuracy of your results. Internally, 26 | \code{aPPR} keeps a running estimate of achieved accuracy that is always valid. 27 | 28 | Anytime you would like to explore more of the graph, you can simply 29 | decrease \code{epsilon}. So you can start with \code{epsilon = 1e-5} and then 30 | gradually decrease \code{epsilon} until you have a sample of the graph 31 | that you are happy with. 32 | 33 | Also note that runtime is proportional to \code{1 / (epsilon * alpha)}, 34 | so small \code{epsilon} can result in long runtimes.} 35 | 36 | \item{max_visits}{Maximum number of unique nodes to visit. Should be a 37 | positive integer. Defaults to \code{Inf}, such that there is no upper bound 38 | on the number of unique nodes to visit. Useful when you want to specify a 39 | fixed amount of computation (or API calls) to use rather than an 40 | error tolerance. 
We recommend debugging with \code{max_visits ~ 20}, 41 | exploration with \code{max_visits} in the hundreds, and \code{max_visits} in the 42 | thousands to ten of thousands for precise results, although this is a 43 | very rough heuristic.} 44 | } 45 | \value{ 46 | A new \code{Tracker} object with a new value of \code{epsilon}. 47 | } 48 | \description{ 49 | Typically because results are insufficiently precise. 50 | } 51 | \details{ 52 | At the moment, only supports changing \code{epsilon}. If there is interest, 53 | we can consider allowing updates to \code{tau}, \code{alpha} and \code{seeds} in the 54 | future. 55 | } 56 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(aPPR) 3 | 4 | test_check("aPPR") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-matches-igraph.R: -------------------------------------------------------------------------------- 1 | skip_if_not_installed("igraph") 2 | library(igraph) 3 | 4 | library(logger) 5 | 6 | log_threshold(WARN, namespace = "aPPR") 7 | 8 | prefer <- function(node, total_nodes = 100) { 9 | alpha <- numeric(total_nodes) 10 | alpha[node] <- 1 11 | alpha 12 | } 13 | 14 | test_that("matches igraph calculations on connected graph", { 15 | 16 | # graph without sink nodes (i.e. every node has an outgoing edge) 17 | g3 <- make_ring(10) 18 | 19 | # make every node a seed node to recover page rank 20 | appr_ppr <- appr(g3, seeds = as.character(1:10)) 21 | 22 | # close enough but currently failing 23 | expect_equal(sum(appr_ppr$stats$p), 1, tolerance = 1e-4) 24 | 25 | appr_ppr2 <- appr(g3, seeds = "1") 26 | 27 | igraph_ppr <- page_rank(g3, personalized = prefer(1, 10))$vector 28 | 29 | # tolerance off by an order of magnitude again? 
30 | expect_equal(sort(appr_ppr2$stats$p), sort(igraph_ppr), tolerance = 1e-4) 31 | }) 32 | 33 | # did this ever work? i don't think it should 34 | # 35 | # test_that("matches igraph calculations on graph with sink nodes", { 36 | # 37 | # set.seed(26) 38 | # 39 | # ig <- sample_pa(100) 40 | # 41 | # # make every node a seed node to recover page rank 42 | # appr_ppr <- appr(ig, seeds = as.character(2:10)) 43 | # 44 | # # close enough but currently failing 45 | # expect_equal(sum(appr_ppr$stats$p), 1, tolerance = 1e-5) 46 | # 47 | # appr_ppr2 <- appr(ig, seeds = "1") 48 | # 49 | # igraph_ppr <- page_rank(ig, personalized = prefer(1, 10))$vector 50 | # 51 | # # tolerance off by an order of magnitude again? 52 | # expect_equal(sort(appr_ppr2$stats$p), sort(igraph_ppr), tolerance = 1e-5) 53 | # }) 54 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/extending-appr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Extending aPPR" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Extending aPPR} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | Suppose you want to calculate personalized PageRanks for some graph that is not supported by `aPPR`. You can extend `aPPR` to arbitrary graph objects, so long the graph object is an S3 object with methods: 18 | 19 | - `check()` 20 | - `node_degrees()` 21 | - `neighborhood()` 22 | - `appr()` (optional) 23 | 24 | See the documentation for those S3 generics to understand the generic specification fully! 
25 | 26 | We demonstrate how to implement these methods below for a new, custom graph object. In this case, we will consider the PubMed citation network, which we will interact via the PubMed API, using the `rentrez` package. First, we define a constructor function that returns a custom S3 graph object that subclasses `abstract_graph`. You can install rentrez with `pak::pak("ropensci/rentrez")`, and do not need to set up any authentication to begin using the API. 27 | 28 | ```{r} 29 | library(aPPR) 30 | library(logger) 31 | library(glue) 32 | library(rentrez) 33 | 34 | # constructor for PubMed graph object (defined over API) 35 | pubmed_graph <- function(max_attempts = 3) { 36 | if (!requireNamespace("rentrez", quietly = TRUE)) { 37 | stop( 38 | "`rentrez` package must be installed to use `pubmed_graph()`", 39 | call. = FALSE 40 | ) 41 | } 42 | 43 | agraph <- abstract_graph( 44 | subclass = "pubmed", 45 | max_attempts = max_attempts 46 | ) 47 | 48 | agraph 49 | } 50 | 51 | graph <- pubmed_graph() 52 | graph 53 | ``` 54 | 55 | Now we want to implement S3 methods for the `pubmed` object. In some cases, you can query data from a graph in large batches, but with the PubMed API it simpler (at least to my limited knowledge) to query node by node, with no bulk lookups. It turns out that we can get neighborhoods and node in-degree and node out-degree all at once, using `rentrez::entrez_link()`. We give the function three attempts (by default) to successfully complete this API call, since APIs sometimes fail. Then, since we will need this information repeatedly, we memoize the function, to avoid repeated calls to the API. 
56 | 57 | ```{r} 58 | # one node at a time 59 | get_pubmed_data <- function(graph, node) { 60 | for (i in 1:graph$max_attempts) { 61 | log_trace( 62 | glue("Attempt {i}/{graph$max_attempts} to get node degrees: {node}") 63 | ) 64 | 65 | tryCatch( 66 | { 67 | cites <- entrez_link(dbfrom = "pubmed", db = "all", id = node) 68 | break 69 | }, 70 | error = function(cnd) { 71 | if (i == graph$max_attempts) { 72 | log_debug( 73 | glue("Maximum attempts to find neighborhood met, could not find: {node}") 74 | ) 75 | stop("Couldn't pull data for node") 76 | } 77 | } 78 | ) 79 | } 80 | 81 | data <- list( 82 | refs = unique(cites$links$pubmed_pubmed_refs), 83 | citedby = unique(cites$links$pubmed_pubmed_citedin) 84 | ) 85 | 86 | data$num_refs <- length(data$refs) 87 | data$num_citedby <- length(data$citedby) 88 | data 89 | } 90 | 91 | memo_get_pubmed_data <- memoise::memoise(get_pubmed_data) 92 | ``` 93 | 94 | Now we test the function. I'm not currently sure that it's working: it's suspicious for two papers to have in-degree and out-degree all equal to 18 -- we need to dig into this and find out if the API is limited to returning a maximum of API results in a single call, for example. 95 | 96 | ```{r} 97 | good_node_ids <- c("30345262", "29624432", "29867837") 98 | bad_node_id <- "I am a pumpkin" 99 | mixed_node_ids <- c(good_node_ids, bad_node_id) 100 | 101 | # this is suspicious to me, something seems wrong here 102 | memo_get_pubmed_data(graph, good_node_ids[1]) 103 | 104 | # suspicious that the number of in-cites and out-cites matches, and that it 105 | # matches across both papers! TODO: investigate! 
memo_get_pubmed_data(graph, good_node_ids[2])
memo_get_pubmed_data(graph, good_node_ids[3])

# check that we handle bad node ids in some reliable way, in this case
# it looks like we get empty results
memo_get_pubmed_data(graph, bad_node_id)
```

```{r}
#' Check method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Can be empty!**
#'
#' @return A **character** vector of node ids that we can reach in the graph.
#'   For example, some node ids may not be reachable due to API failures,
#'   or, more generally, permissions failures.
#'
#'   If `nodes` is the empty vector, returns the empty vector. Be sure to
#'   handle this edge case.
check.pubmed <- function(graph, nodes) {
  log_debug(glue("Checking nodes"))

  # handle the case where no nodes are passed
  if (length(nodes) < 1) {
    return(character(0))
  }

  good_nodes <- character(0)

  for (node in nodes) {
    # a node whose data we cannot pull at all (exhausted API retries) is
    # unreachable: per the contract above it must be dropped, not allowed
    # to abort the whole check with an error
    node_data <- tryCatch(
      memo_get_pubmed_data(graph, node),
      error = function(cnd) NULL
    )

    if (is.null(node_data)) {
      log_trace(glue("Checked node: {node} (bad)"))
      next
    }

    # this is a sufficient check to see if (1) the node is in pubmed, (2)
    # we can pull its neighborhood, and (3) it has at least one
    # incoming or outgoing citation
    if (node_data$num_refs + node_data$num_citedby > 0) {
      log_trace(glue("Checked node: {node} (good)"))
      good_nodes <- c(good_nodes, node)
      next
    }

    log_trace(glue("Checked node: {node} (bad)"))
  }

  good_nodes
}
```

Now we test our implementation. To do this, we should give at least one good node id, and at least one bad node id. Only the good node id should be returned.

```{r}
check(graph, good_node_ids)
check(graph, bad_node_id)
check(graph, mixed_node_ids)
```

```{r}
#' Degree method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Cannot be empty.** Should
#'   not contain duplicates if `check()` is properly implemented and does
#'   not output duplicates.
#'
#' @return A list, with two elements, `in_degree` and `out_degree`. Both
#'   should be the same length as `nodes`, and match the order of `nodes`.
#'
node_degrees.pubmed <- function(graph, nodes) {
  log_debug(glue("Getting node degrees"))

  # preallocate both degree vectors to match the order of `nodes`
  degrees <- list(
    in_degree = integer(length(nodes)),
    out_degree = integer(length(nodes))
  )

  for (i in seq_along(nodes)) {
    log_debug(glue("Getting node degrees for node: {nodes[i]}"))
    node_data <- memo_get_pubmed_data(graph, nodes[i])

    # must treat pubmed like an undirected graph: a citation network is a
    # directed acyclic graph (papers only cite the past), so no pair of
    # nodes is mutually reachable and directed pagerank is not defined

    degrees$in_degree[i] <- node_data$num_citedby + node_data$num_refs
    degrees$out_degree[i] <- node_data$num_citedby + node_data$num_refs

    log_trace(glue("In-degree for node {nodes[i]}: {degrees$in_degree[i]}"))
    log_trace(glue("Out-degree for node {nodes[i]}: {degrees$out_degree[i]}"))
  }

  log_debug(glue("Done getting node degrees"))

  degrees
}
```

To test this method, we should pass a character vector of several good node ids.

```{r}
# test with a single node
node_degrees(graph, good_node_ids[1])

# test with multiple nodes! this is the key one!
this is suspicious, and
# means we need to check if our function memo_get_pubmed_data() is working
node_degrees(graph, good_node_ids)
```

```{r}
#' Neighborhood method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param node A length one character vector, for a node in the graph with
#'   at least one outgoing edge.
#'
#' @return A **character** vector of node ids for the graph neighborhood.
#'   Should be a vector of length at least one (if the `check()` method was
#'   implemented correctly), and should not contain duplicates.
neighborhood.pubmed <- function(graph, node) {
  # `!= 1` also rejects length-zero input, which `> 1` let through
  if (length(node) != 1) {
    stop("`node` must be a character vector of length one.")
  }

  log_debug(glue("Getting neighborhood: {node}"))
  node_data <- memo_get_pubmed_data(graph, node)
  log_debug(glue("Done getting neighborhood: {node}"))

  # the undirected neighborhood is references plus citing papers. NOTE:
  # `unique()` takes a single vector; its second positional argument is
  # `incomparables`, so `unique(node_data$refs, node_data$citedby)` would
  # silently drop all the `citedby` ids. Concatenate first, then dedupe.
  unique(c(node_data$refs, node_data$citedby))
}
```

```{r}
neighborhood(graph, good_node_ids[1])
neighborhood(graph, good_node_ids[2])
neighborhood(graph, good_node_ids[3])
```

Lastly, you can optionally implement an `appr` method for your abstract graph subclass. In the `appr` method for the subclass, you can do things like:

- Add functionality to convert a convenient seed node name (in this case, possibly something like a DOI) into the internal node name representation (see `appr.rtweet_graph()` for an example of this)
- Checks that you have appropriate authorization to pull information about the seed nodes
- Etc, etc

This custom subclass method will run before the general `appr.abstract_graph()`. We don't have a particular need to do anything like that here, so we do not.

### Debugging

If you are accessing a graph over an API, it's likely that you will encounter edge cases where the API returns no data, or data in a format that you did not expect. We highly recommend using logging to debug your implementation when this happens, using the `logger` library. See the `logger` documentation for details.

Find any errors, fix them, and rinse and repeat until you've completed the likely unpleasant task of tracking down all the edge cases in the API. In our case, we don't seem to find any edge cases right away.

```{r}
library(logger)

# set logging threshold for code you just wrote, if desired
log_threshold(TRACE)

# set logging threshold for aPPR package functions, if desired
log_threshold(DEBUG, namespace = "aPPR")

appr(
  graph, # the graph to work with
  seeds = good_node_ids[1], # name of seed node (character)
  epsilon = 0.0005, # desired approximation quality
  max_visits = 10 # bound computation since this is an example
)
```
--------------------------------------------------------------------------------