├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ ├── pkgdown.yaml │ ├── pr-commands.yaml │ └── test-coverage.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R ├── aPPR-package.R ├── aPPR.R ├── abstract-graph.R ├── graph-igraph.R ├── graph-rtweet.R ├── tracker.R └── update.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── aPPR.Rproj ├── codecov.yml ├── cran-comments.md ├── man ├── Tracker.Rd ├── aPPR-package.Rd ├── abstract_graph.Rd ├── appr.Rd ├── check.Rd ├── neighborhood.Rd ├── node_degrees.Rd ├── rtweet_graph.Rd └── update.Tracker.Rd ├── tests ├── testthat.R └── testthat │ └── test-matches-igraph.R └── vignettes ├── .gitignore └── extending-appr.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^aPPR\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^LICENSE\.md$ 5 | ^_pkgdown\.yml$ 6 | ^docs$ 7 | ^pkgdown$ 8 | ^codecov\.yml$ 9 | ^\.github$ 10 | ^cran-comments\.md$ 11 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | # 4 | # NOTE: This workflow is overkill for most R packages and 5 | # check-standard.yaml is likely a better choice. 6 | # usethis::use_github_action("check-standard") will install it. 
7 | on: 8 | push: 9 | branches: [main, master] 10 | pull_request: 11 | branches: [main, master] 12 | 13 | name: R-CMD-check 14 | 15 | jobs: 16 | R-CMD-check: 17 | runs-on: ${{ matrix.config.os }} 18 | 19 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 20 | 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | config: 25 | - {os: macos-latest, r: 'release'} 26 | 27 | - {os: windows-latest, r: 'release'} 28 | # use 4.1 to check with rtools40's older compiler 29 | - {os: windows-latest, r: '4.1'} 30 | 31 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 32 | - {os: ubuntu-latest, r: 'release'} 33 | - {os: ubuntu-latest, r: 'oldrel-1'} 34 | - {os: ubuntu-latest, r: 'oldrel-2'} 35 | - {os: ubuntu-latest, r: 'oldrel-3'} 36 | 37 | env: 38 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 39 | R_KEEP_PKG_SOURCE: yes 40 | 41 | steps: 42 | - uses: actions/checkout@v3 43 | 44 | - uses: r-lib/actions/setup-pandoc@v2 45 | 46 | - uses: r-lib/actions/setup-r@v2 47 | with: 48 | r-version: ${{ matrix.config.r }} 49 | http-user-agent: ${{ matrix.config.http-user-agent }} 50 | use-public-rspm: true 51 | 52 | - uses: r-lib/actions/setup-r-dependencies@v2 53 | with: 54 | extra-packages: any::rcmdcheck 55 | needs: check 56 | 57 | - uses: r-lib/actions/check-r-package@v2 58 | with: 59 | upload-snapshots: true 60 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.github/workflows/pr-commands.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | name: Commands 8 | 9 | jobs: 10 | document: 11 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} 12 | name: document 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | steps: 17 | - uses: actions/checkout@v3 18 | 19 | - uses: r-lib/actions/pr-fetch@v2 20 | with: 21 | repo-token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - uses: r-lib/actions/setup-r@v2 24 | with: 25 | use-public-rspm: true 26 | 27 | - uses: r-lib/actions/setup-r-dependencies@v2 28 | with: 29 | extra-packages: any::roxygen2 30 | needs: pr-document 31 | 32 | - name: Document 33 | run: roxygen2::roxygenise() 34 | shell: Rscript {0} 35 | 36 | - name: commit 37 | run: | 38 | git config --local user.name "$GITHUB_ACTOR" 39 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 40 | git add man/\* NAMESPACE 41 | git commit -m 'Document' 42 | 43 | - uses: r-lib/actions/pr-push@v2 44 | with: 45 | repo-token: ${{ secrets.GITHUB_TOKEN }} 46 | 47 | style: 48 | if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} 49 | name: style 50 | runs-on: ubuntu-latest 51 | env: 52 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 53 | steps: 54 | - uses: actions/checkout@v3 55 | 56 | - uses: r-lib/actions/pr-fetch@v2 57 | with: 58 | repo-token: ${{ secrets.GITHUB_TOKEN }} 59 | 60 | - uses: r-lib/actions/setup-r@v2 61 | 62 | - name: Install dependencies 63 | run: install.packages("styler") 64 | shell: Rscript {0} 65 | 66 | - name: Style 67 | run: styler::style_pkg() 68 | shell: Rscript {0} 69 | 70 | - name: commit 71 | run: | 72 | git config --local user.name 
"$GITHUB_ACTOR" 73 | git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" 74 | git add \*.R 75 | git commit -m 'Style' 76 | 77 | - uses: r-lib/actions/pr-push@v2 78 | with: 79 | repo-token: ${{ secrets.GITHUB_TOKEN }} 80 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | jobs: 12 | test-coverage: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::covr 27 | needs: coverage 28 | 29 | - name: Test coverage 30 | run: | 31 | covr::codecov( 32 | quiet = FALSE, 33 | clean = FALSE, 34 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 35 | ) 36 | shell: Rscript {0} 37 | 38 | - name: Show testthat output 39 | if: always() 40 | run: | 41 | ## -------------------------------------------------------------------- 42 | find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true 43 | shell: bash 44 | 45 | - name: Upload test results 46 | if: failure() 47 | uses: actions/upload-artifact@v3 48 | with: 49 | name: coverage-test-failures 50 | path: ${{ runner.temp }}/package 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | 
inst/doc 5 | docs 6 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: aPPR 2 | Title: Approximate Personalized PageRank 3 | Version: 0.0.0.9200 4 | Authors@R: c( 5 | person("Alex", "Hayes", , "alexpghayes@gmail.com", role = c("aut", "cre", "cph"), 6 | comment = c(ORCID = "0000-0002-4985-5160")), 7 | person("Fan", "Chen", , "fan.chen@wisc.edu", role = "aut", 8 | comment = c(ORCID = "0000-0003-4508-6023")), 9 | person("Karl", "Rohe", role = "aut") 10 | ) 11 | Description: Calculates approximate and regularized personalized PageRank 12 | vectors for massive graphs, including those that can only be queried 13 | via an API. Regularization allows discovery of community structure 14 | under some stochastic block models. 15 | License: MIT + file LICENSE 16 | URL: https://rohelab.github.io/aPPR/, https://github.com/RoheLab/aPPR 17 | BugReports: https://github.com/RoheLab/aPPR/issues 18 | Imports: 19 | ellipsis, 20 | glue, 21 | logger, 22 | memoise, 23 | pander, 24 | R6, 25 | tibble 26 | Suggests: 27 | covr, 28 | igraph (>= 1.2.5), 29 | knitr, 30 | rentrez, 31 | rmarkdown, 32 | rtweet (>= 0.7.0.9011), 33 | testthat (>= 3.0.0) 34 | Remotes: 35 | ropensci/rtweet 36 | Encoding: UTF-8 37 | LazyData: true 38 | Roxygen: list(markdown = TRUE) 39 | RoxygenNote: 7.2.3 40 | Collate: 41 | 'aPPR-package.R' 42 | 'abstract-graph.R' 43 | 'aPPR.R' 44 | 'graph-igraph.R' 45 | 'graph-rtweet.R' 46 | 'tracker.R' 47 | 'update.R' 48 | Config/testthat/edition: 3 49 | VignetteBuilder: knitr 50 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2022 2 | COPYRIGHT HOLDER: aPPR authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: 
-------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 aPPR authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(appr,abstract_graph) 4 | S3method(appr,igraph) 5 | S3method(appr,rtweet_graph) 6 | S3method(print,abstract_graph) 7 | S3method(update,Tracker) 8 | export(abstract_graph) 9 | export(appr) 10 | export(check) 11 | export(neighborhood) 12 | export(node_degrees) 13 | export(rtweet_graph) 14 | import(logger) 15 | import(pander) 16 | importFrom(R6,R6Class) 17 | importFrom(glue,glue) 18 | importFrom(memoise,memoise) 19 | importFrom(tibble,tibble) 20 | -------------------------------------------------------------------------------- /R/aPPR-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | # The following block is used by usethis to automatically manage 5 | # roxygen namespace tags. Modify with care! 6 | ## usethis namespace: start 7 | ## usethis namespace: end 8 | NULL 9 | 10 | #' @import logger 11 | #' @importFrom glue glue 12 | NULL 13 | 14 | #' @import pander 15 | .onLoad <- function(libname, pkgname) { 16 | log_formatter(formatter_pander, namespace = pkgname) 17 | } 18 | -------------------------------------------------------------------------------- /R/aPPR.R: -------------------------------------------------------------------------------- 1 | #' Approximate personalized pageranks 2 | #' 3 | #' Computes the personalized pagerank for specified seeds using the 4 | #' `ApproximatePageRank` algorithm of Andersen et al. (2006). Computes 5 | #' degree-adjustments and degree-regularization of personalized 6 | #' pagerank vectors as described in Algorithms 3 and 4 of Chen et al. (2019). 7 | #' These algorithms are randomized; if results are unstable across 8 | #' multiple runs, decrease `epsilon`. 
9 | #' 10 | #' @param graph An [abstract_graph()] object, such as that created by 11 | #' [rtweet_graph()]. This argument is required. 12 | #' 13 | #' @param seeds A character vector of seeds for the personalized pagerank. 14 | #' The personalized pagerank will return to each of these seeds with 15 | #' probability `alpha` at each node transition. At the moment, 16 | #' all seeds are given equal weighting. This argument is required. 17 | #' 18 | #' @param alpha Teleportation constant. The teleportation constant is the 19 | #' probability of returning to a seed node at each node transition. 20 | #' `alpha` must be a valid probability; that is, between zero and one. 21 | #' Defaults to `0.15`. This is the inverse of the "dampening factor" 22 | #' in the original PageRank paper, so `alpha = 0.15` corresponds 23 | #' to a dampening factor of `0.85`. Runtime is proportional to 24 | #' `1 / (epsilon * alpha)`, so small `alpha` can result in long 25 | #' runtimes. 26 | #' 27 | #' @param epsilon Desired accuracy of approximation. `epsilon` must be 28 | #' a small positive number. Defaults to `1e-6`. `aPPR` guarantees that 29 | #' approximated personalized pageranks are uniformly within `epsilon` of 30 | #' their true value. That is, the approximation is guaranteed to be good 31 | #' in an L-infinity sense. This does not guarantee, however, that 32 | #' a ranking of nodes by aPPR is close to a ranking of nodes by PPR. 33 | #' 34 | #' For Twitter graphs, we recommend testing your code with `1e-4` or `1e-5`, 35 | #' using `1e-6` for exploration, and `1e-7` to `1e-8` for final results, 36 | #' although these numbers are very rough. It is also perfectly reasonable 37 | #' to run `aPPR` for a given number of steps (set via `max_visits`), 38 | #' and then note the approximation accuracy of your results. Internally, 39 | #' `aPPR` keeps a running estimate of achieved accuracy that is always valid. 
40 | #' 41 | #' Anytime you would like to explore more of the graph, you can simply 42 | #' decrease `epsilon`. So you can start with `epsilon = 1e-5` and then 43 | #' gradually decrease `epsilon` until you have a sample of the graph 44 | #' that you are happy with. 45 | #' 46 | #' Also note that runtime is proportional to `1 / (epsilon * alpha)`, 47 | #' so small `epsilon` can result in long runtimes. 48 | #' 49 | #' @param tau Regularization term. Additionally inflates the in-degree 50 | #' of each observation by this term by performing the degree 51 | #' adjustment described in Algorithm 3 and Algorithm 4, which 52 | #' are described in `vignette("Mathematical details")`. Defaults to 53 | #' `NULL`, in which case `tau` is set to the average in-degree of 54 | #' the observed nodes. In general, it's reasonable to 55 | #' set `tau` to the average in-degree of the graph. 56 | #' 57 | #' @param max_visits Maximum number of unique nodes to visit. Should be a 58 | #' positive integer. Defaults to `Inf`, such that there is no upper bound 59 | #' on the number of unique nodes to visit. Useful when you want to specify a 60 | #' fixed amount of computation (or API calls) to use rather than an 61 | #' error tolerance. We recommend debugging with `max_visits ~ 20`, 62 | #' exploration with `max_visits` in the hundreds, and `max_visits` in the 63 | #' thousands to tens of thousands for precise results, although this is a 64 | #' very rough heuristic. 65 | #' 66 | #' @param ... Ignored. Passing arguments to `...` results in a warning. 67 | #' 68 | #' 69 | #' @return A [Tracker()] object. Most relevant is the `stats` field, 70 | #' a [tibble::tibble()] with the following columns: 71 | #' 72 | #' - `name`: Name of a node (character). 73 | #' - `p`: Current estimate of the pagerank for a node. 74 | #' - `r`: Current estimate of residual per out-degree for a node. 75 | #' - `in_degree`: Number of incoming edges to a node. 
76 | #' - `out_degree`: Number of outgoing edges from a node. 77 | #' - `degree_adjusted`: The personalized pagerank divided by the 78 | #' node in-degree. 79 | #' - `regularized`: The personalized pagerank divided by the node 80 | #' in-degree plus `tau`. 81 | #' 82 | #' When computing personalized pageranks for Twitter users (e.g. 83 | #' via [rtweet_graph()]), `name` is given 84 | #' as a user ID, not a screen name, regardless of how the seed nodes 85 | #' were specified. 86 | #' 87 | #' @export 88 | #' 89 | #' @references 90 | #' 91 | #' 1. Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from Massive Block Model Graphs with Personalized PageRank.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 82, no. 1 (February 2020): 99–126. https://doi.org/10.1111/rssb.12349. 92 | #' 2. Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, USA: IEEE, 2006. https://doi.org/10.1109/FOCS.2006.44. 93 | #' 94 | #' @examples 95 | #' 96 | #' library(aPPR) 97 | #' library(igraph) 98 | #' 99 | #' set.seed(27) 100 | #' 101 | #' graph <- rtweet_graph() 102 | #' 103 | #' \dontrun{ 104 | #' appr(graph, "alexpghayes") 105 | #' } 106 | #' 107 | #' graph2 <- sample_pa(100) 108 | #' 109 | #' # this creates a Tracker object 110 | #' ppr_results <- appr(graph2, seeds = "5") 111 | #' 112 | #' # the portion of the Tracker object you probably care about 113 | #' ppr_results$stats 114 | #' 115 | appr <- function(graph, seeds, ..., alpha = 0.15, epsilon = 1e-6, tau = NULL, 116 | max_visits = Inf) { 117 | ellipsis::check_dots_used() 118 | 119 | if (alpha <= 0 || alpha >= 1) 120 | stop("`alpha` must be strictly between zero and one.", call. = FALSE) 121 | 122 | if (epsilon <= 0 || epsilon >= 1) 123 | stop("`epsilon` must be strictly between zero and one.", call. 
= FALSE) 124 | 125 | if (!is.null(tau) && tau < 0) 126 | stop("`tau` must be greater than zero.", call. = FALSE) 127 | 128 | UseMethod("appr") 129 | } 130 | 131 | #' @include abstract-graph.R 132 | #' @export 133 | appr.abstract_graph <- function(graph, seeds, ..., alpha = 0.15, 134 | epsilon = 1e-6, tau = NULL, 135 | max_visits = Inf) { 136 | tracker <- Tracker$new(graph, alpha, epsilon, tau, max_visits) 137 | 138 | log_debug("Checking seed nodes ... ") 139 | good_seeds <- check(graph, seeds) 140 | log_debug(glue("Checking seed nodes ... good_seeds: {good_seeds}")) 141 | log_debug("Checking seed nodes ... done") 142 | 143 | for (seed in seeds) { 144 | 145 | if (!(seed %in% good_seeds)) { 146 | stop( 147 | glue("Seed {seed} must be available and have positive out degree."), 148 | call. = FALSE 149 | ) 150 | } 151 | 152 | log_info(glue("Adding seed {seed} to tracker ...")) 153 | tracker$add_seed(seed, preference = 1 / length(seeds)) 154 | log_info(glue("Adding seed {seed} to tracker ... done")) 155 | 156 | } 157 | 158 | tracker$calculate_ppr() 159 | tracker$regularize() 160 | tracker 161 | } 162 | -------------------------------------------------------------------------------- /R/abstract-graph.R: -------------------------------------------------------------------------------- 1 | #' Create an abstract graph object 2 | #' 3 | #' Could be an actual graph object, or a graph such as the Twitter 4 | #' following network defined implicitly via API requests, etc. 5 | #' The abstract graph is just a list with `abstract_graph` class 6 | #' and your desired subclass. 7 | #' 8 | #' @param subclass Desired subclass (character). 9 | #' @param ... Other arguments to pass to `list()`. See 10 | #' [rtweet_graph()] for an example. 11 | #' 12 | #' @export 13 | abstract_graph <- function(subclass, ...) { 14 | graph <- list(...) 
15 | class(graph) <- c(subclass, "abstract_graph") 16 | graph 17 | } 18 | 19 | #' Check if a node in an abstract graph is acceptable for inclusion in PPR 20 | #' 21 | #' Inclusion criteria: 22 | #' 23 | #' - At least one outgoing edge 24 | #' - Can get in degree and out degree of node 25 | #' - Can get all nodes connected to `node` / the 1-hop neighborhood 26 | #' 27 | #' @param graph A graph object. 28 | #' @param nodes The name(s) of node(s) in `graph` as a character vector. 29 | #' 30 | #' @return The subset of `nodes` that are acceptable for inclusion. This 31 | #' can be a character vector of length zero if necessary. It is critical 32 | #' that no entries of `nodes` are duplicated in this output, so we 33 | #' recommend calling `unique()` if there is any potential for repeats 34 | #' in your checking code. 35 | #' 36 | #' @export 37 | check <- function(graph, nodes) { 38 | UseMethod("check") 39 | } 40 | 41 | #' Get the in-degree and out-degree of nodes in an abstract graph 42 | #' 43 | #' This function is only called on nodes that have been [check()]'d. It is 44 | #' safe to assume that `nodes` is non-empty. 45 | #' 46 | #' @param graph A graph object. 47 | #' @param nodes The name(s) of node(s) in `graph` as a character vector. 48 | #' Methods may assume that there are no repeated values in `nodes`. 49 | #' 50 | #' @return A [data.frame()] with one row for every node in `nodes` and 51 | #' two columns: `in_degree` and `out_degree`. In a symmetric graph, 52 | #' `in_degree` and `out_degree` should match. 53 | #' 54 | #' @export 55 | node_degrees <- function(graph, nodes) { 56 | UseMethod("node_degrees") 57 | } 58 | 59 | #' Get the neighborhood of a node in a graph 60 | #' 61 | #' That is, find all nodes connected to `node` by an outgoing edge. 62 | #' This function is memoised to avoid making repeated API queries. 63 | #' 64 | #' @param graph A graph object. 65 | #' @param node The name of a single node in `graph` as a character vector. 
66 | #' 67 | #' @return A character vector of all nodes in `graph` connected such that 68 | #' there is an outgoing edge from `node` to those nodes. This should 69 | #' never be empty, as `neighborhood()` should not be called on nodes 70 | #' that fail `check()`, and `check()` enforces that nodes have out-degree 71 | #' of at least one. It is critical that no node names are duplicated in the 72 | #' output; we recommend calling `unique()` if there is any potential 73 | #' for that to occur. 74 | #' 75 | #' @export 76 | neighborhood <- function(graph, node) { 77 | 78 | if (length(node) != 1) 79 | stop("`node` must be a character vector of length 1L.", call. = FALSE) 80 | 81 | UseMethod("neighborhood") 82 | } 83 | 84 | # memoized versions, these are what actually get used 85 | #' @importFrom memoise memoise 86 | memo_neighborhood <- memoise::memoise(neighborhood) 87 | 88 | #' @method print abstract_graph 89 | #' @export 90 | print.abstract_graph <- function(x, ...) { 91 | cat(glue("Abstract graph object (subclass: {class(x)[1]})\n")) 92 | } 93 | -------------------------------------------------------------------------------- /R/graph-igraph.R: -------------------------------------------------------------------------------- 1 | 2 | #' @rdname appr 3 | #' @export 4 | appr.igraph <- function(graph, seeds, ...) { 5 | 6 | if (!requireNamespace("igraph", quietly = TRUE)) 7 | stop("`igraph` package must be installed to use igraphs.", call. = FALSE) 8 | 9 | if (is.null(igraph::V(graph)$name)) 10 | igraph::V(graph)$name <- as.character(1:igraph::gorder(graph)) 11 | 12 | appr.abstract_graph(graph = graph, seeds = seeds, ...) 
13 | } 14 | 15 | check.igraph <- function(graph, nodes) { 16 | 17 | node_names <- names(igraph::V(graph)) 18 | nodes_in_graph <- nodes[nodes %in% node_names] 19 | 20 | nodes_in_graph[igraph::degree(graph, v = nodes_in_graph, mode = "out") > 0] 21 | } 22 | 23 | node_degrees.igraph <- function(graph, nodes) { 24 | list( 25 | in_degree = igraph::degree(graph, v = nodes, mode = "in"), 26 | out_degree = igraph::degree(graph, v = nodes, mode = "out") 27 | ) 28 | } 29 | 30 | # character list of neighboring nodes 31 | # treat directed vs undirected differently? 32 | neighborhood.igraph <- function(graph, node) { 33 | int_node_list <- igraph::ego( 34 | graph, nodes = node, mode = "out", mindist = 1 35 | ) 36 | 37 | nodes <- int_node_list[[1]] 38 | igraph::V(graph)$name[nodes] 39 | } 40 | 41 | -------------------------------------------------------------------------------- /R/graph-rtweet.R: -------------------------------------------------------------------------------- 1 | #' Create an abstract representation of the Twitter friendship graph 2 | #' 3 | #' Signifies that `aPPR` should query the Twitter friendship graph via 4 | #' `rtweet`. 5 | #' 6 | #' @inheritParams rtweet::get_friends 7 | #' 8 | #' @export 9 | rtweet_graph <- function(retryonratelimit = TRUE, verbose = TRUE, n = 5000) { 10 | 11 | if (!requireNamespace("rtweet", quietly = TRUE)) { 12 | stop( 13 | "`rtweet` package must be installed to use `rtweet_graph()`", 14 | call. = FALSE 15 | ) 16 | } 17 | 18 | agraph <- abstract_graph( 19 | "rtweet_graph", 20 | retryonratelimit = retryonratelimit, 21 | verbose = verbose, 22 | max_friends = n 23 | ) 24 | 25 | agraph 26 | } 27 | 28 | #' @rdname appr 29 | #' @export 30 | appr.rtweet_graph <- function(graph, seeds, ...) 
{ 31 | 32 | seed_data <- rtweet::lookup_users( 33 | seeds, 34 | retryonratelimit = graph$retryonratelimit, 35 | verbose = graph$verbose 36 | ) 37 | 38 | if (any(seed_data$protected)) { 39 | stop("Seed nodes should not be protected Twitter accounts.", call. = FALSE) 40 | } 41 | 42 | # convert seeds, potentially passed as screen names, to user ids 43 | seeds <- seed_data$id_str 44 | 45 | NextMethod() 46 | } 47 | 48 | # return character vector of all good nodes in the batch 49 | #' @importFrom glue glue 50 | check.rtweet_graph <- function(graph, nodes) { 51 | 52 | logger::log_debug(glue("Checking nodes")) 53 | logger::log_trace(glue("Checking nodes: {nodes}")) 54 | 55 | if (length(nodes) < 1) 56 | return(character(0)) 57 | 58 | node_data <- rtweet::lookup_users( 59 | nodes, 60 | retryonratelimit = graph$retryonratelimit, 61 | verbose = graph$verbose 62 | ) 63 | 64 | if (is.null(node_data) || nrow(node_data) < 1) 65 | return(character(0)) 66 | 67 | good_nodes <- !node_data$protected & node_data$friends_count > 0 68 | 69 | logger::log_debug(glue("Done checking nodes")) 70 | 71 | node_data$id_str[good_nodes] 72 | } 73 | 74 | node_degrees.rtweet_graph <- function(graph, nodes) { 75 | 76 | logger::log_debug(glue("Getting node degrees")) 77 | logger::log_trace(glue("Getting node degrees for node: {nodes}")) 78 | 79 | # assumes that you want any errors / empty rows when accessing this 80 | # data, i.e. 
that the nodes have already been checked 81 | 82 | node_data <- rtweet::lookup_users( 83 | nodes, 84 | retryonratelimit = graph$retryonratelimit, 85 | verbose = graph$verbose 86 | ) 87 | 88 | logger::log_debug(glue("Done getting node degrees")) 89 | 90 | list( 91 | in_degree = node_data$followers_count, 92 | out_degree = node_data$friends_count 93 | ) 94 | } 95 | 96 | neighborhood.rtweet_graph <- function(graph, node) { 97 | 98 | logger::log_debug(glue("Getting neighborhood: {node}")) 99 | 100 | # if a user doesn't follow anyone, safe_get_friends returns an empty 101 | # tibble, but instead it should return an empty character vector? 102 | friends <- rtweet::get_friends( 103 | users = node, 104 | n = graph$max_friends, 105 | retryonratelimit = graph$retryonratelimit, 106 | verbose = graph$verbose 107 | ) 108 | 109 | logger::log_debug(glue("Done getting neighborhood")) 110 | 111 | if (nrow(friends) < 1) character(0) else friends$to_id 112 | } 113 | 114 | -------------------------------------------------------------------------------- /R/tracker.R: -------------------------------------------------------------------------------- 1 | #' R6 class to manage personalized pagerank calculations 2 | #' 3 | #' @importFrom R6 R6Class 4 | #' 5 | Tracker <- R6Class("Tracker", list( 6 | 7 | #' @field seeds A character vector of the seed nodes. 8 | seeds = character(0), 9 | 10 | #' @field path A character vector of nodes whose neighborhoods we 11 | #' examined. 12 | path = character(0), 13 | 14 | #' @field stats A [tibble::tibble()] with one row for each visited 15 | #' node and the following columns: 16 | #' 17 | #' - `name`: Name of a node (character). 18 | #' - `r`: Current estimate of residual per out-degree for a node. 19 | #' - `p`: Current estimate of the pagerank for a node. 20 | #' - `in_degree`: Number of incoming edges to a node. 21 | #' - `out_degree`: Number of outcoming edges from a node. 
22 | #' 23 | stats = NULL, 24 | 25 | #' @field failed A character vector of nodes that could not be visited. 26 | failed = character(0), 27 | 28 | #' @field graph An abstract graph object. 29 | graph = NULL, 30 | 31 | #' @field alpha Teleportation constant from Algorithm 3. 32 | alpha = numeric(0), 33 | 34 | #' @field alpha_prime Transformed teleportation constant from Algorithm 3. 35 | alpha_prime = numeric(0), 36 | 37 | #' @field epsilon Error tolerance. 38 | epsilon = numeric(0), 39 | 40 | #' @field max_visits Maximum number of nodes to visit before terminating. 41 | max_visits = integer(0), 42 | 43 | #' @field tau Regularization parameter used in Algorithm 4. 44 | tau = numeric(0), 45 | 46 | #' @description 47 | #' 48 | #' Create a new Tracker object. 49 | #' 50 | #' @param graph See [appr()]. 51 | #' @param alpha See [appr()]. 52 | #' @param epsilon See [appr()]. 53 | #' @param tau See [appr()]. 54 | #' @param max_visits See [appr()]. 55 | #' 56 | #' @return A new `Tracker` object. 57 | #' 58 | #' @importFrom tibble tibble 59 | #' 60 | initialize = function(graph, alpha, epsilon, tau, max_visits) { 61 | 62 | self$graph <- graph 63 | self$alpha <- alpha 64 | self$alpha_prime <- alpha / (2 - alpha) 65 | self$epsilon <- epsilon 66 | self$tau <- tau 67 | self$max_visits <- max_visits 68 | 69 | self$stats <- tibble::tibble( 70 | name = character(0), 71 | regularized = numeric(0), 72 | p = numeric(0), 73 | in_degree = numeric(0), 74 | out_degree = numeric(0), 75 | degree_adjusted = numeric(0), 76 | r = numeric(0) 77 | ) 78 | }, 79 | 80 | #' @description 81 | #' 82 | #' Print the tibble containing the current state of the pagerank 83 | #' calculation. 
84 | #' 85 | print = function() { 86 | 87 | cat("Personalized PageRank Approximator\n") 88 | cat("----------------------------------\n\n") 89 | 90 | cat(glue(" - number of seeds: {length(self$seeds)}\n", .trim = FALSE)) 91 | cat(glue(" - unique nodes visited so far: {length(unique(self$path))} out of maximum of {self$max_visits}\n", .trim = FALSE)) 92 | cat(glue(" - total visits so far: {length(self$path)}\n", .trim = FALSE)) 93 | cat(glue(" - bad nodes so far: {length(self$failed)}\n\n", .trim = FALSE)) 94 | 95 | cat(glue(" - teleportation constant (alpha): {self$alpha}\n", .trim = FALSE)) 96 | cat(glue(" - desired approximation error (epsilon): {self$epsilon}\n", .trim = FALSE)) 97 | cat(glue(" - achieved bound on approximation error: {self$current_approximation_error()}\n", .trim = FALSE)) 98 | cat(glue(" - length of to visit list: {length(self$remaining())}\n\n", .trim = FALSE)) 99 | 100 | cat(glue("PPR table (see $stats field):\n\n")) 101 | 102 | print(self$stats) 103 | invisible(self) 104 | }, 105 | 106 | #' @description 107 | #' 108 | #' Determine nodes that need to be visited. Note that, 109 | #' if there is a node with zero out degree, you will never 110 | #' leave from that node. So it is important to make sure 111 | #' we never add nodes with zero out degree into the tracker. 112 | #' 113 | #' @return A character vector of node names with current residuals 114 | #' greater than `epsilon`. 115 | #' 116 | remaining = function() { 117 | 118 | # when we initialize, we need to initialize to the seeds 119 | # here we check for initialization by consider the path 120 | # of nodes we've visited so far. it's very important that 121 | # we do not populate `path` when adding the seeds 122 | if (length(self$path) < 1) 123 | return(self$seeds) 124 | 125 | self$stats[self$stats$r > self$epsilon * self$stats$out_degree, ]$name 126 | }, 127 | 128 | #' @description 129 | #' 130 | #' Determine current quality of approximation. 
131 | #' 132 | #' @return A numeric vector of length one with the current worst 133 | #' error bound. 134 | #' 135 | current_approximation_error = function() { 136 | 137 | nodewise_approx_error <- self$stats$r / self$stats$out_degree 138 | max(nodewise_approx_error) 139 | }, 140 | 141 | #' @description 142 | #' 143 | #' Check if there is already a row for a particular node 144 | #' 145 | #' @param nodes Character name of node(s) in the graph. 146 | #' 147 | #' @return `TRUE` if there is a row for `node`, `FALSE` if there 148 | #' is not a row for `node`. 149 | #' 150 | in_tracker = function(nodes) { 151 | nodes %in% self$stats$name 152 | }, 153 | 154 | #' @description 155 | #' 156 | #' Check if we previously failed to visit a node 157 | #' 158 | #' @param node Character name of a node in the graph. 159 | #' 160 | #' @return `TRUE` if we failed to visit `node`, `FALSE` otherwise. 161 | #' Note that this function will return `FALSE` if `node` is new 162 | #' and we haven't seen it before. 163 | #' 164 | in_failed = function(node) { 165 | node %in% self$failed 166 | }, 167 | 168 | #' @description 169 | #' 170 | #' Create an entry for `node` in the tracker. Assumes that 171 | #' `node` is not in the tracker yet, and does not check if 172 | #' this is the case. 173 | #' 174 | #' @param seeds The name of the node in the graph as a length 1 175 | #' character vector. 176 | #' 177 | #' @param preference TODO: recall what on earth this is. 178 | #' 179 | add_seed = function(seeds, preference) { 180 | self$seeds <- c(self$seeds, seeds) 181 | self$add_nodes(nodes = seeds, preference = preference) 182 | }, 183 | 184 | #' @description 185 | #' 186 | #' TODO 187 | #' 188 | #' @param node The name of the node in the graph as a length 1 189 | #' character vector. 190 | #' 191 | add_to_path = function(node) { 192 | self$path <- c(self$path, node) 193 | }, 194 | 195 | #' @description 196 | #' 197 | #' Create an entry for `node` in the tracker. 
Assumes that 198 | #' `node` is not in the tracker yet, and does not check if 199 | #' this is the case. 200 | #' 201 | #' @param nodes The name(s) of node(s) in the graph as a character vector. 202 | #' 203 | #' @param preference TODO: recall what on earth this is. 204 | #' 205 | add_nodes = function(nodes, preference = 0) { 206 | 207 | log_trace(glue("Adding node(s) to tracker: {nodes}")) 208 | 209 | degree <- node_degrees(self$graph, nodes) 210 | 211 | self$stats <- tibble::add_row( 212 | self$stats, 213 | name = nodes, 214 | regularized = NA_real_, 215 | p = 0, 216 | in_degree = degree$in_degree, 217 | out_degree = degree$out_degree, 218 | degree_adjusted = NA_real_, 219 | r = preference 220 | ) 221 | 222 | }, 223 | 224 | #' @description 225 | #' 226 | #' Add `node` to the list of nodes we failed to visit. 227 | #' Assumes that `node` is not in the failed list yet, and 228 | #' does not check if this is the case. 229 | #' 230 | #' @param nodes The name of the node in the graph as a length 1 231 | #' character vector. 232 | #' 233 | add_failed = function(nodes) { 234 | self$failed <- c(self$failed, nodes) 235 | }, 236 | 237 | #' @description 238 | #' 239 | #' Update the estimate of the personalized pagerank for a given node 240 | #' 241 | #' @param node Character name of a node in the graph. 242 | #' 243 | update_p = function(node) { 244 | 245 | node_index <- which(self$stats$name == node) 246 | self$stats[[node_index, "p"]] <- self$stats[[node_index, "p"]] + 247 | self$alpha_prime * self$stats[[node_index, "r"]] 248 | }, 249 | 250 | #' @description 251 | #' 252 | #' Update the residual of a *good* node in the neighborhood of 253 | #' the current node, adding it to the tracker if necessary 254 | #' 255 | #' @param u Character name of the node we are currently visiting. 256 | #' @param v Names of neighbors of `u` as a character vector. Can 257 | #' contain multiple elements. Can also contain zero elements. 
258 |   #'
259 |   update_r_neighbor = function(u, v) {
260 | 
261 |     log_trace(glue("update_r_neighbor({u}, {v})"))
262 | 
263 |     stopifnot(length(u) == 1)
264 | 
265 |     if (length(v) < 1)
266 |       return(invisible(NULL))
267 | 
268 |     new_nodes <- v[!self$in_tracker(v)]
269 | 
270 |     if (length(new_nodes) > 0)
271 |       self$add_nodes(new_nodes)
272 | 
273 |     u_index <- which(self$stats$name == u)
274 |     v_index <- match(v, self$stats$name)
275 | 
276 |     self$stats[v_index, "r"] <- self$stats[v_index, "r"] +
277 |       (1 - self$alpha_prime) * self$stats[[u_index, "r"]] /
278 |       (2 * self$stats[[u_index, "out_degree"]])
279 | 
280 |   },
281 | 
282 |   #' @description
283 |   #'
284 |   #' Update the residual of current node
285 |   #'
286 |   #' @param node Character name of the node we are currently visiting.
287 |   #'
288 |   update_r_self = function(node) {
289 |     node_index <- which(self$stats$name == node)
290 |     self$stats[[node_index, "r"]] <- (1 - self$alpha_prime) *
291 |       self$stats[[node_index, "r"]] / 2
292 |   },
293 | 
294 |   #' @description
295 |   #'
296 |   #' Compute the degree-adjusted and regularized variants of personalized
297 |   #' PageRank as in Algorithm 4, based on the outputs of Algorithm 3.
298 |   #'
299 |   #' Takes no arguments. Uses `self$tau` when it was supplied, and the
300 |   #' mean in-degree of visited nodes otherwise.
301 |   regularize = function() {
302 | 
303 |     # use the user-supplied tau if given, the mean in-degree otherwise
304 |     # (previously `tau` was left unbound whenever `self$tau` was non-NULL)
305 |     tau <- if (is.null(self$tau)) mean(self$stats$in_degree) else self$tau
306 | 
307 |     # might divide by 0 here
308 |     self$stats$degree_adjusted <- self$stats$p / self$stats$in_degree
309 |     self$stats$regularized <- self$stats$p / (self$stats$in_degree + tau)
310 |   },
311 | 
312 |   #' @description
313 |   #'
314 |   #' Main driver function to perform the computations outlined in
315 |   #' Algorithm 3.
316 |   #'
317 |   #' Takes no arguments; loops until no nodes remain or `max_visits` is hit.
318 | #' 319 | calculate_ppr = function() { 320 | 321 | log_info("Approximating PPR ...") 322 | 323 | remaining <- self$remaining() 324 | unique_visits_so_far <- length(unique(self$path)) 325 | 326 | log_info(glue( 327 | "Visits: {length(self$path)} total / ", 328 | "{unique_visits_so_far} unique (max {self$max_visits}) / ", 329 | "{length(remaining)} to visit / ", 330 | "current epsilon: {self$current_approximation_error()}.", 331 | .trim = FALSE 332 | )) 333 | 334 | while (length(remaining) > 0) { 335 | 336 | if (unique_visits_so_far >= self$max_visits) { 337 | warning("Maximum visits reached. Finishing aPPR calculation early.", call. = FALSE) 338 | break 339 | } 340 | 341 | u <- if (length(remaining) == 1) remaining else sample(remaining, size = 1) 342 | 343 | log_trace(glue("Visting {u}")) 344 | 345 | self$update_p(u) 346 | 347 | # here we come into contact with reality and must depart from the 348 | # warm embrace of algorithm 3 349 | 350 | # this is where we learn about new nodes. there are two kinds of new 351 | # nodes: "good" nodes that we can visit, and "bad" nodes that we can't 352 | # visit, such as protected Twitter accounts or nodes that the API fails 353 | # to get for some reason. 
we want to: 354 | # 355 | # - update the good nodes are we typically would 356 | # - pretend the bad nodes don't exist 357 | # 358 | # also note that we only want to *check* each node once 359 | 360 | neighbors <- memo_neighborhood(self$graph, u) 361 | 362 | self$add_to_path(u) 363 | 364 | # first deal with the good neighbors we've already seen all 365 | # at once 366 | 367 | known_good <- neighbors[self$in_tracker(neighbors)] 368 | known_bad <- neighbors[self$in_failed(neighbors)] 369 | 370 | unknown <- setdiff(neighbors, c(known_good, known_bad)) 371 | 372 | new_good <- check(self$graph, unknown) 373 | new_bad <- setdiff(unknown, new_good) 374 | 375 | log_debug( 376 | glue( 377 | "{length(known_good)} known good / ", 378 | "{length(known_bad)} known bad / ", 379 | "{length(new_good)} new good / ", 380 | "{length(new_bad)} new bad", 381 | sep = " " 382 | ) 383 | ) 384 | 385 | log_trace(glue("known good: {known_good}")) 386 | log_trace(glue("known bad: {known_bad}")) 387 | log_trace(glue("new good: {new_good}")) 388 | log_trace(glue("new bad: {new_bad}")) 389 | 390 | self$add_failed(new_bad) 391 | self$update_r_neighbor(u, known_good) 392 | self$update_r_neighbor(u, new_good) 393 | 394 | self$update_r_self(u) 395 | 396 | remaining <- self$remaining() 397 | unique_visits_so_far <- length(unique(self$path)) 398 | 399 | log_info(glue( 400 | "Visits: {length(self$path)} total / ", 401 | "{unique_visits_so_far} unique (max {self$max_visits}) / ", 402 | "{length(remaining)} to visit / ", 403 | "current epsilon: {self$current_approximation_error()}.", 404 | .trim = FALSE 405 | )) 406 | } 407 | 408 | log_info("Approximating PPR ... done") 409 | } 410 | )) 411 | -------------------------------------------------------------------------------- /R/update.R: -------------------------------------------------------------------------------- 1 | #' Update a Tracker object 2 | #' 3 | #' Typically because results are insufficiently precise. 
4 | #'
5 | #' At the moment, only supports changing `epsilon` and `max_visits`. If
6 | #' there is interest, we can consider allowing updates to `tau`, `alpha`
7 | #' and `seeds` in the future.
8 | #'
9 | #' @param object The `Tracker` object to update.
10 | #'
11 | #' @inheritParams appr
12 | #'
13 | #' @return The `Tracker` object with PPR estimates recomputed under the
14 | #' @export
15 | #'
16 | update.Tracker <- function(object, ..., epsilon = object$epsilon, max_visits = object$max_visits) {
17 | 
18 |   object$epsilon <- epsilon
19 |   object$max_visits <- max_visits
20 |   object$calculate_ppr()
21 |   object$regularize()
22 |   object
23 | }
24 | 
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 | 
5 | 
6 | 
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 |   collapse = TRUE,
10 |   comment = "#>",
11 |   fig.path = "man/figures/README-",
12 |   out.width = "100%",
13 |   error = TRUE
14 | )
15 | ```
16 | 
17 | # aPPR
18 | 
19 | 
20 | [![R-CMD-check](https://github.com/RoheLab/aPPR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/RoheLab/aPPR/actions/workflows/R-CMD-check.yaml)
21 | [![Codecov test coverage](https://codecov.io/gh/RoheLab/aPPR/branch/main/graph/badge.svg)](https://app.codecov.io/gh/RoheLab/aPPR?branch=main)
22 | 
23 | 
24 | 
25 | `aPPR` helps you calculate approximate personalized pageranks from large graphs, including those that can only be queried via an API. `aPPR` additionally performs degree correction and regularization, allowing you to recover blocks from stochastic blockmodels.
26 | 
27 | To learn more about `aPPR` you can:
28 | 
29 | 1. Glance through slides from the [JSM2021](https://github.com/alexpghayes/JSM2021) talk
30 | 2. 
Read the accompanying [paper][chen] 31 | 32 | ### Installation 33 | 34 | You can install the development version from [GitHub](https://github.com/) with: 35 | 36 | ``` r 37 | # install.packages("devtools") 38 | devtools::install_github("RoheLab/aPPR") 39 | ``` 40 | 41 | ### Find the personalized pagerank of a node in an `igraph` graph 42 | 43 | ```{r igraph-example, message = FALSE} 44 | library(aPPR) 45 | library(igraph) 46 | 47 | set.seed(27) 48 | 49 | erdos_renyi_graph <- sample_gnp(n = 100, p = 0.5) 50 | 51 | erdos_tracker <- appr( 52 | erdos_renyi_graph, # the graph to work with 53 | seeds = "5", # name of seed node (character) 54 | epsilon = 0.0005 # desired approximation quality (see ?appr) 55 | ) 56 | 57 | erdos_tracker 58 | ``` 59 | 60 | You can access the Personalized PageRanks themselves via the `stats` field of `Tracker` objects. 61 | 62 | ```{r} 63 | erdos_tracker$stats 64 | ``` 65 | 66 | Sometimes you may wish to limit computation time by limiting the number of nodes to visit, which you can do as follows: 67 | 68 | ```{r igraph-example2} 69 | limited_visits_tracker <- appr( 70 | erdos_renyi_graph, 71 | seeds = "5", 72 | epsilon = 1e-10, 73 | max_visits = 20 # max unique nodes to visit during approximation 74 | ) 75 | 76 | limited_visits_tracker 77 | ``` 78 | 79 | ### Find the personalized pagerank of a Twitter user using `rtweet` 80 | 81 | ```{r rtweet-example} 82 | ftrevorc_ppr <- appr( 83 | rtweet_graph(), 84 | "ftrevorc", 85 | epsilon = 1e-4, 86 | max_visits = 5 87 | ) 88 | 89 | ftrevorc_ppr 90 | ``` 91 | 92 | ### Logging 93 | 94 | `aPPR` uses [`logger`](https://daroczig.github.io/logger/) for displaying information to the user. By default, `aPPR` is quite verbose. You can control verbosity by loading `logger` and setting the logging threshold. 
95 | 
96 | ```{r logging-example-1, eval = FALSE}
97 | library(logger)
98 | 
99 | # hide basically all messages (not recommended)
100 | log_threshold(FATAL, namespace = "aPPR")
101 | 
102 | appr(
103 |   erdos_renyi_graph, # the graph to work with
104 |   seeds = "5", # name of seed node (character)
105 |   epsilon = 0.0005 # desired approximation quality (see ?appr)
106 | )
107 | ```
108 | 
109 | If you submit a bug report, please please please include a log file using the TRACE threshold. You can set up this kind of detailed logging via the following:
110 | 
111 | ```{r log-file-example, eval = FALSE}
112 | 
113 | set.seed(528491) # be sure to set seed for bug reports
114 | 
115 | log_appender(
116 |   appender_file(
117 |     "/path/to/logfile.log" ## TODO: choose a path to log to
118 |   ),
119 |   namespace = "aPPR"
120 | )
121 | 
122 | log_threshold(TRACE, namespace = "aPPR")
123 | 
124 | tracker <- appr(
125 |   rtweet_graph(),
126 |   seeds = c("hadleywickham", "gvanrossum"),
127 |   epsilon = 1e-6
128 | )
129 | ```
130 | 
131 | ### Ethical considerations
132 | 
133 | People have a right to choose how public and discoverable their information is. `aPPR` will often lead you to accounts that are interesting, but also small and out of sight. Do not change the public profile of, or direct attention towards, the people running these accounts, or any other accounts, without their permission.
134 | 
135 | ### References
136 | 
137 | 1. Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from Massive Block Model Graphs with Personalized PageRank.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 82, no. 1 (February 2020): 99–126. https://doi.org/10.1111/rssb.12349. [arxiv][chen]
138 | 
139 | 2. Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, USA: IEEE, 2006. https://doi.org/10.1109/FOCS.2006.44. 
140 | 141 | [chen]: https://arxiv.org/abs/1910.12937 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # aPPR 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/RoheLab/aPPR/workflows/R-CMD-check/badge.svg)](https://github.com/RoheLab/aPPR/actions) 9 | [![Codecov test 10 | coverage](https://codecov.io/gh/RoheLab/aPPR/branch/main/graph/badge.svg)](https://app.codecov.io/gh/RoheLab/aPPR?branch=main) 11 | 12 | 13 | `aPPR` helps you calculate approximate personalized pageranks from large 14 | graphs, including those that can only be queried via an API. `aPPR` 15 | additionally performs degree correction and regularization, allowing you 16 | to recover blocks from stochastic blockmodels. 17 | 18 | To learn more about `aPPR` you can: 19 | 20 | 1. Glance through slides from the 21 | [JSM2021](https://github.com/alexpghayes/JSM2021) talk 22 | 2. Read the accompanying [paper](https://arxiv.org/abs/1910.12937) 23 | 24 | ### Installation 25 | 26 | You can install the development version from 27 | [GitHub](https://github.com/) with: 28 | 29 | ``` r 30 | # install.packages("devtools") 31 | devtools::install_github("RoheLab/aPPR") 32 | ``` 33 | 34 | ### Find the personalized pagerank of a node in an `igraph` graph 35 | 36 | ``` r 37 | library(aPPR) 38 | library(igraph) 39 | 40 | set.seed(27) 41 | 42 | erdos_renyi_graph <- sample_gnp(n = 100, p = 0.5) 43 | 44 | erdos_tracker <- appr( 45 | erdos_renyi_graph, # the graph to work with 46 | seeds = "5", # name of seed node (character) 47 | epsilon = 0.0005 # desired approximation quality (see ?appr) 48 | ) 49 | 50 | erdos_tracker 51 | #> Personalized PageRank Approximator 52 | #> ---------------------------------- 53 | #> 54 | #> - number of seeds: 1 55 | #> - visits so far: 5 56 | #> - unique nodes visited so far: 1 out of maximum of Inf 57 | #> - bad nodes so far: 0 58 | #> 59 | #> - 
teleportation constant (alpha): 0.15 60 | #> - desired approximation error (epsilon): 5e-04 61 | #> - achieved bound on approximation error: 0.000416297883029663 62 | #> - current length of to-visit list: 0 63 | #> 64 | #> PPR table (see $stats field): 65 | #> # A tibble: 51 × 7 66 | #> name r p in_degree out_degree degree_adjusted regularized 67 | #> 68 | #> 1 5 0.0205 0.147 50 50 0.00294 0.00147 69 | #> 2 3 0.0167 0 51 51 0 0 70 | #> 3 6 0.0167 0 59 59 0 0 71 | #> 4 8 0.0167 0 41 41 0 0 72 | #> 5 15 0.0167 0 46 46 0 0 73 | #> 6 16 0.0167 0 52 52 0 0 74 | #> 7 17 0.0167 0 48 48 0 0 75 | #> 8 19 0.0167 0 54 54 0 0 76 | #> 9 20 0.0167 0 51 51 0 0 77 | #> 10 21 0.0167 0 55 55 0 0 78 | #> # … with 41 more rows 79 | ``` 80 | 81 | You can access the Personalized PageRanks themselves via the `stats` 82 | field of `Tracker` objects. 83 | 84 | ``` r 85 | erdos_tracker$stats 86 | #> # A tibble: 51 × 7 87 | #> name r p in_degree out_degree degree_adjusted regularized 88 | #> 89 | #> 1 5 0.0205 0.147 50 50 0.00294 0.00147 90 | #> 2 3 0.0167 0 51 51 0 0 91 | #> 3 6 0.0167 0 59 59 0 0 92 | #> 4 8 0.0167 0 41 41 0 0 93 | #> 5 15 0.0167 0 46 46 0 0 94 | #> 6 16 0.0167 0 52 52 0 0 95 | #> 7 17 0.0167 0 48 48 0 0 96 | #> 8 19 0.0167 0 54 54 0 0 97 | #> 9 20 0.0167 0 51 51 0 0 98 | #> 10 21 0.0167 0 55 55 0 0 99 | #> # … with 41 more rows 100 | ``` 101 | 102 | Sometimes you may wish to limit computation time by limiting the number 103 | of nodes to visit, which you can do as follows: 104 | 105 | ``` r 106 | limited_visits_tracker <- appr( 107 | erdos_renyi_graph, 108 | seeds = "5", 109 | epsilon = 1e-10, 110 | max_visits = 20 # max unique nodes to visit during approximation 111 | ) 112 | #> Warning: Maximum visits reached. Finishing aPPR calculation early. 
113 | limited_visits_tracker 114 | #> Personalized PageRank Approximator 115 | #> ---------------------------------- 116 | #> 117 | #> - number of seeds: 1 118 | #> - visits so far: 22 119 | #> - unique nodes visited so far: 20 out of maximum of 20 120 | #> - bad nodes so far: 0 121 | #> 122 | #> - teleportation constant (alpha): 0.15 123 | #> - desired approximation error (epsilon): 1e-10 124 | #> - achieved bound on approximation error: 0.00423832387327568 125 | #> - current length of to-visit list: 100 126 | #> 127 | #> PPR table (see $stats field): 128 | #> # A tibble: 100 × 7 129 | #> name r p in_degree out_degree degree_adjusted regularized 130 | #> 131 | #> 1 5 0.212 0.118 50 50 0.00237 0.00119 132 | #> 2 3 0.0140 0 51 51 0 0 133 | #> 3 6 0.0140 0 59 59 0 0 134 | #> 4 8 0.0140 0 41 41 0 0 135 | #> 5 15 0.0136 0 46 46 0 0 136 | #> 6 16 0.0138 0 52 52 0 0 137 | #> 7 17 0.0138 0 48 48 0 0 138 | #> 8 19 0.0137 0 54 54 0 0 139 | #> 9 20 0.0135 0 51 51 0 0 140 | #> 10 21 0.0138 0 55 55 0 0 141 | #> # … with 90 more rows 142 | ``` 143 | 144 | ### Find the personalized pagerank of a Twitter user using `rtweet` 145 | 146 | ``` r 147 | ftrevorc_ppr <- appr( 148 | rtweet_graph(), 149 | "ftrevorc", 150 | epsilon = 1e-4, 151 | max_visits = 5 152 | ) 153 | #> Warning: Maximum visits reached. Finishing aPPR calculation early. 
154 | ftrevorc_ppr 155 | #> Personalized PageRank Approximator 156 | #> ---------------------------------- 157 | #> 158 | #> - number of seeds: 1 159 | #> - visits so far: 6 160 | #> - unique nodes visited so far: 5 out of maximum of 5 161 | #> - bad nodes so far: 10 162 | #> 163 | #> - teleportation constant (alpha): 0.15 164 | #> - desired approximation error (epsilon): 1e-04 165 | #> - achieved bound on approximation error: 0.00175980395529336 166 | #> - current length of to-visit list: 5 167 | #> 168 | #> PPR table (see $stats field): 169 | #> # A tibble: 210 × 7 170 | #> name r p in_degree out_degree degree_adjusted regularized 171 | #> 172 | #> 1 7752257741314… 0.211 0.118 69 120 0.00172 5.50e-8 173 | #> 2 17163639 0.00559 0 20033 1596 0 0 174 | #> 3 9381208958721… 0.00559 0 372 179 0 0 175 | #> 4 1359003756063… 0.00559 0 230 116 0 0 176 | #> 5 76228303 0.00559 0 7253 2274 0 0 177 | #> 6 1024298722828… 0.00559 0 382 829 0 0 178 | #> 7 1264590946144… 0.00559 0 116 189 0 0 179 | #> 8 1107711818997… 0.00559 0 3404 410 0 0 180 | #> 9 1217315090 0.00559 0 20660 402 0 0 181 | #> 10 1120701503763… 0.00559 0 354 243 0 0 182 | #> # … with 200 more rows 183 | ``` 184 | 185 | ### Logging 186 | 187 | `aPPR` uses [`logger`](https://daroczig.github.io/logger/) for 188 | displaying information to the user. By default, `aPPR` is quite verbose. 189 | You can control verbosity by loading `logger` and setting the logging 190 | threshold. 191 | 192 | ``` r 193 | library(logger) 194 | 195 | # hide basically all messages (not recommended) 196 | log_threshold(FATAL, namespace = "aPPR") 197 | 198 | appr( 199 | erdos_renyi_graph, # the graph to work with 200 | seeds = "5", # name of seed node (character) 201 | epsilon = 0.0005 # desired approximation quality (see ?appr) 202 | ) 203 | ``` 204 | 205 | If you submit a bug report, please please please include a log file 206 | using the TRACE threshold. 
You can set up this kind of detailed logging 207 | via the following: 208 | 209 | ``` r 210 | set.seed(528491) # be sure to set seed for bug reports 211 | 212 | log_appender( 213 | appender_file( 214 | "/path/to/logfile.log" ## TODO: choose a path to log to 215 | ), 216 | namespace = "aPPR" 217 | ) 218 | 219 | log_threshold(TRACE, namespace = "aPPR") 220 | 221 | tracker <- appr( 222 | rtweet_graph(), 223 | seed = c("hadleywickham", "gvanrossum"), 224 | epsilon = 1e-6 225 | ) 226 | ``` 227 | 228 | ### Ethical considerations 229 | 230 | People have a right to choose how public and discoverable their 231 | information is. `aPPR` will often lead you to accounts that interesting, 232 | but also small and out of sight. Do not change the public profile or 233 | attention towards these the people running these accounts, or any other 234 | accounts, without their permission. 235 | 236 | ### References 237 | 238 | 1. Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from 239 | Massive Block Model Graphs with Personalized PageRank.” Journal of 240 | the Royal Statistical Society: Series B (Statistical Methodology) 241 | 82, no. 1 (February 2020): 99–126. 242 | . 243 | [arxiv](https://arxiv.org/abs/1910.12937) 244 | 245 | 2. Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning 246 | Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on 247 | Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, 248 | USA: IEEE, 2006. . 
249 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | authors: 2 | Alex Hayes: 3 | href: https://alexpghayes.com 4 | 5 | development: 6 | mode: auto 7 | 8 | template: 9 | bootstrap: 5 10 | params: 11 | bootswatch: flatly 12 | 13 | 14 | reference: 15 | - title: "Define and interact with graphs" 16 | contents: 17 | - abstract_graph 18 | - node_degrees 19 | - check 20 | - neighborhood 21 | - title: "Compute general aPPR results" 22 | contents: 23 | - appr 24 | - contains("Tracker") 25 | - title: "Compute Personalized PageRanks of Twitter users" 26 | contents: 27 | - contains("rtweet") 28 | -------------------------------------------------------------------------------- /aPPR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## R CMD check results 2 | 3 
| 0 errors | 0 warnings | 1 note 4 | 5 | New submission 6 | 7 | Version contains large components (0.0.0.9102) 8 | 9 | Possibly mis-spelled words in DESCRIPTION: 10 | PageRank (2:33, 11:66) 11 | 12 | Unknown, possibly mis-spelled, fields in DESCRIPTION: 13 | ‘Remotes’ 14 | 15 | Package has a VignetteBuilder field but no prebuilt vignette index. 16 | 17 | * This is a new release. 18 | -------------------------------------------------------------------------------- /man/Tracker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tracker.R 3 | \name{Tracker} 4 | \alias{Tracker} 5 | \title{R6 class to manage personalized pagerank calculations} 6 | \description{ 7 | R6 class to manage personalized pagerank calculations 8 | 9 | R6 class to manage personalized pagerank calculations 10 | } 11 | \section{Public fields}{ 12 | \if{html}{\out{
}} 13 | \describe{ 14 | \item{\code{seeds}}{A character vector of the seed nodes.} 15 | 16 | \item{\code{path}}{A character vector of nodes whose neighborhoods we 17 | examined.} 18 | 19 | \item{\code{stats}}{A \code{\link[tibble:tibble]{tibble::tibble()}} with one row for each visited 20 | node and the following columns: 21 | \itemize{ 22 | \item \code{name}: Name of a node (character). 23 | \item \code{r}: Current estimate of residual per out-degree for a node. 24 | \item \code{p}: Current estimate of the pagerank for a node. 25 | \item \code{in_degree}: Number of incoming edges to a node. 26 | \item \code{out_degree}: Number of outcoming edges from a node. 27 | }} 28 | 29 | \item{\code{failed}}{A character vector of nodes that could not be visited.} 30 | 31 | \item{\code{graph}}{An abstract graph object.} 32 | 33 | \item{\code{alpha}}{Teleportation constant from Algorithm 3.} 34 | 35 | \item{\code{alpha_prime}}{Transformed teleportation constant from Algorithm 3.} 36 | 37 | \item{\code{epsilon}}{Error tolerance.} 38 | 39 | \item{\code{max_visits}}{Maximum number of nodes to visit before terminating.} 40 | 41 | \item{\code{tau}}{Regularization parameter used in Algorithm 4.} 42 | } 43 | \if{html}{\out{
}} 44 | } 45 | \section{Methods}{ 46 | \subsection{Public methods}{ 47 | \itemize{ 48 | \item \href{#method-Tracker-new}{\code{Tracker$new()}} 49 | \item \href{#method-Tracker-print}{\code{Tracker$print()}} 50 | \item \href{#method-Tracker-remaining}{\code{Tracker$remaining()}} 51 | \item \href{#method-Tracker-current_approximation_error}{\code{Tracker$current_approximation_error()}} 52 | \item \href{#method-Tracker-in_tracker}{\code{Tracker$in_tracker()}} 53 | \item \href{#method-Tracker-in_failed}{\code{Tracker$in_failed()}} 54 | \item \href{#method-Tracker-add_seed}{\code{Tracker$add_seed()}} 55 | \item \href{#method-Tracker-add_to_path}{\code{Tracker$add_to_path()}} 56 | \item \href{#method-Tracker-add_nodes}{\code{Tracker$add_nodes()}} 57 | \item \href{#method-Tracker-add_failed}{\code{Tracker$add_failed()}} 58 | \item \href{#method-Tracker-update_p}{\code{Tracker$update_p()}} 59 | \item \href{#method-Tracker-update_r_neighbor}{\code{Tracker$update_r_neighbor()}} 60 | \item \href{#method-Tracker-update_r_self}{\code{Tracker$update_r_self()}} 61 | \item \href{#method-Tracker-regularize}{\code{Tracker$regularize()}} 62 | \item \href{#method-Tracker-calculate_ppr}{\code{Tracker$calculate_ppr()}} 63 | \item \href{#method-Tracker-clone}{\code{Tracker$clone()}} 64 | } 65 | } 66 | \if{html}{\out{
}} 67 | \if{html}{\out{}} 68 | \if{latex}{\out{\hypertarget{method-Tracker-new}{}}} 69 | \subsection{Method \code{new()}}{ 70 | Create a new Tracker object. 71 | \subsection{Usage}{ 72 | \if{html}{\out{
}}\preformatted{Tracker$new(graph, alpha, epsilon, tau, max_visits)}\if{html}{\out{
}} 73 | } 74 | 75 | \subsection{Arguments}{ 76 | \if{html}{\out{
}} 77 | \describe{ 78 | \item{\code{graph}}{See \code{\link[=appr]{appr()}}.} 79 | 80 | \item{\code{alpha}}{See \code{\link[=appr]{appr()}}.} 81 | 82 | \item{\code{epsilon}}{See \code{\link[=appr]{appr()}}.} 83 | 84 | \item{\code{tau}}{See \code{\link[=appr]{appr()}}.} 85 | 86 | \item{\code{max_visits}}{See \code{\link[=appr]{appr()}}.} 87 | } 88 | \if{html}{\out{
}} 89 | } 90 | \subsection{Returns}{ 91 | A new \code{Tracker} object. 92 | } 93 | } 94 | \if{html}{\out{
}} 95 | \if{html}{\out{}} 96 | \if{latex}{\out{\hypertarget{method-Tracker-print}{}}} 97 | \subsection{Method \code{print()}}{ 98 | Print the tibble containing the current state of the pagerank 99 | calculation. 100 | \subsection{Usage}{ 101 | \if{html}{\out{
}}\preformatted{Tracker$print()}\if{html}{\out{
}} 102 | } 103 | 104 | } 105 | \if{html}{\out{
}} 106 | \if{html}{\out{}} 107 | \if{latex}{\out{\hypertarget{method-Tracker-remaining}{}}} 108 | \subsection{Method \code{remaining()}}{ 109 | Determine nodes that need to be visited. Note that, 110 | if there is a node with zero out degree, you will never 111 | leave from that node. So it is important to make sure 112 | we never add nodes with zero out degree into the tracker. 113 | \subsection{Usage}{ 114 | \if{html}{\out{
}}\preformatted{Tracker$remaining()}\if{html}{\out{
}} 115 | } 116 | 117 | \subsection{Returns}{ 118 | A character vector of node names with current residuals 119 | greater than \code{epsilon}. 120 | } 121 | } 122 | \if{html}{\out{
}} 123 | \if{html}{\out{}} 124 | \if{latex}{\out{\hypertarget{method-Tracker-current_approximation_error}{}}} 125 | \subsection{Method \code{current_approximation_error()}}{ 126 | Determine current quality of approximation. 127 | \subsection{Usage}{ 128 | \if{html}{\out{
}}\preformatted{Tracker$current_approximation_error()}\if{html}{\out{
}} 129 | } 130 | 131 | \subsection{Returns}{ 132 | A numeric vector of length one with the current worst 133 | error bound. 134 | } 135 | } 136 | \if{html}{\out{
}} 137 | \if{html}{\out{}} 138 | \if{latex}{\out{\hypertarget{method-Tracker-in_tracker}{}}} 139 | \subsection{Method \code{in_tracker()}}{ 140 | Check if there is already a row for a particular node 141 | \subsection{Usage}{ 142 | \if{html}{\out{
}}\preformatted{Tracker$in_tracker(nodes)}\if{html}{\out{
}} 143 | } 144 | 145 | \subsection{Arguments}{ 146 | \if{html}{\out{
}} 147 | \describe{ 148 | \item{\code{nodes}}{Character name of node(s) in the graph.} 149 | } 150 | \if{html}{\out{
}} 151 | } 152 | \subsection{Returns}{ 153 | \code{TRUE} if there is a row for \code{node}, \code{FALSE} if there 154 | is not a row for \code{node}. 155 | } 156 | } 157 | \if{html}{\out{
}} 158 | \if{html}{\out{}} 159 | \if{latex}{\out{\hypertarget{method-Tracker-in_failed}{}}} 160 | \subsection{Method \code{in_failed()}}{ 161 | Check if we previously failed to visit a node 162 | \subsection{Usage}{ 163 | \if{html}{\out{
}}\preformatted{Tracker$in_failed(node)}\if{html}{\out{
}} 164 | } 165 | 166 | \subsection{Arguments}{ 167 | \if{html}{\out{
}} 168 | \describe{ 169 | \item{\code{node}}{Character name of a node in the graph.} 170 | } 171 | \if{html}{\out{
}} 172 | } 173 | \subsection{Returns}{ 174 | \code{TRUE} if we failed to visit \code{node}, \code{FALSE} otherwise. 175 | Note that this function will return \code{FALSE} if \code{node} is new 176 | and we haven't seen it before. 177 | } 178 | } 179 | \if{html}{\out{
}} 180 | \if{html}{\out{}} 181 | \if{latex}{\out{\hypertarget{method-Tracker-add_seed}{}}} 182 | \subsection{Method \code{add_seed()}}{ 183 | Create an entry for \code{node} in the tracker. Assumes that 184 | \code{node} is not in the tracker yet, and does not check if 185 | this is the case. 186 | \subsection{Usage}{ 187 | \if{html}{\out{
}}\preformatted{Tracker$add_seed(seeds, preference)}\if{html}{\out{
}} 188 | } 189 | 190 | \subsection{Arguments}{ 191 | \if{html}{\out{
}} 192 | \describe{ 193 | \item{\code{seeds}}{The name of the node in the graph as a length 1 194 | character vector.} 195 | 196 | \item{\code{preference}}{TODO: recall what on earth this is.} 197 | } 198 | \if{html}{\out{
}} 199 | } 200 | } 201 | \if{html}{\out{
}} 202 | \if{html}{\out{}} 203 | \if{latex}{\out{\hypertarget{method-Tracker-add_to_path}{}}} 204 | \subsection{Method \code{add_to_path()}}{ 205 | TODO 206 | \subsection{Usage}{ 207 | \if{html}{\out{
}}\preformatted{Tracker$add_to_path(node)}\if{html}{\out{
}} 208 | } 209 | 210 | \subsection{Arguments}{ 211 | \if{html}{\out{
}} 212 | \describe{ 213 | \item{\code{node}}{The name of the node in the graph as a length 1 214 | character vector.} 215 | } 216 | \if{html}{\out{
}} 217 | } 218 | } 219 | \if{html}{\out{
}} 220 | \if{html}{\out{}} 221 | \if{latex}{\out{\hypertarget{method-Tracker-add_nodes}{}}} 222 | \subsection{Method \code{add_nodes()}}{ 223 | Create an entry for \code{node} in the tracker. Assumes that 224 | \code{node} is not in the tracker yet, and does not check if 225 | this is the case. 226 | \subsection{Usage}{ 227 | \if{html}{\out{
}}\preformatted{Tracker$add_nodes(nodes, preference = 0)}\if{html}{\out{
}} 228 | } 229 | 230 | \subsection{Arguments}{ 231 | \if{html}{\out{
}} 232 | \describe{ 233 | \item{\code{nodes}}{The name(s) of node(s) in the graph as a character vector.} 234 | 235 | \item{\code{preference}}{TODO: recall what on earth this is.} 236 | } 237 | \if{html}{\out{
}} 238 | } 239 | } 240 | \if{html}{\out{
}} 241 | \if{html}{\out{}} 242 | \if{latex}{\out{\hypertarget{method-Tracker-add_failed}{}}} 243 | \subsection{Method \code{add_failed()}}{ 244 | Add \code{node} to the list of nodes we failed to visit. 245 | Assumes that \code{node} is not in the failed list yet, and 246 | does not check if this is the case. 247 | \subsection{Usage}{ 248 | \if{html}{\out{
}}\preformatted{Tracker$add_failed(nodes)}\if{html}{\out{
}} 249 | } 250 | 251 | \subsection{Arguments}{ 252 | \if{html}{\out{
}} 253 | \describe{ 254 | \item{\code{nodes}}{The name of the node in the graph as a length 1 255 | character vector.} 256 | } 257 | \if{html}{\out{
}} 258 | } 259 | } 260 | \if{html}{\out{
}} 261 | \if{html}{\out{}} 262 | \if{latex}{\out{\hypertarget{method-Tracker-update_p}{}}} 263 | \subsection{Method \code{update_p()}}{ 264 | Update the estimate of the personalized pagerank for a given node 265 | \subsection{Usage}{ 266 | \if{html}{\out{
}}\preformatted{Tracker$update_p(node)}\if{html}{\out{
}} 267 | } 268 | 269 | \subsection{Arguments}{ 270 | \if{html}{\out{
}} 271 | \describe{ 272 | \item{\code{node}}{Character name of a node in the graph.} 273 | } 274 | \if{html}{\out{
}} 275 | } 276 | } 277 | \if{html}{\out{
}} 278 | \if{html}{\out{}} 279 | \if{latex}{\out{\hypertarget{method-Tracker-update_r_neighbor}{}}} 280 | \subsection{Method \code{update_r_neighbor()}}{ 281 | Update the residual of a \emph{good} node in the neighborhood of 282 | the current node, adding it to the tracker if necessary 283 | \subsection{Usage}{ 284 | \if{html}{\out{
}}\preformatted{Tracker$update_r_neighbor(u, v)}\if{html}{\out{
}} 285 | } 286 | 287 | \subsection{Arguments}{ 288 | \if{html}{\out{
}} 289 | \describe{ 290 | \item{\code{u}}{Character name of the node we are currently visiting.} 291 | 292 | \item{\code{v}}{Names of neighbors of \code{u} as a character vector. Can 293 | contain multiple elements. Can also contain zero elements.} 294 | } 295 | \if{html}{\out{
}} 296 | } 297 | } 298 | \if{html}{\out{
}} 299 | \if{html}{\out{}} 300 | \if{latex}{\out{\hypertarget{method-Tracker-update_r_self}{}}} 301 | \subsection{Method \code{update_r_self()}}{ 302 | Update the residual of current node 303 | \subsection{Usage}{ 304 | \if{html}{\out{
}}\preformatted{Tracker$update_r_self(node)}\if{html}{\out{
}} 305 | } 306 | 307 | \subsection{Arguments}{ 308 | \if{html}{\out{
}} 309 | \describe{ 310 | \item{\code{node}}{Character name of the node we are currently visiting.} 311 | } 312 | \if{html}{\out{
}} 313 | } 314 | } 315 | \if{html}{\out{
}} 316 | \if{html}{\out{}} 317 | \if{latex}{\out{\hypertarget{method-Tracker-regularize}{}}} 318 | \subsection{Method \code{regularize()}}{ 319 | Compute the degree-adjusted and regularized variants of personalized 320 | PageRank as in Algorithm 4, based on the outputs of Algorithm 3. 321 | \subsection{Usage}{ 322 | \if{html}{\out{
}}\preformatted{Tracker$regularize()}\if{html}{\out{
}} 323 | } 324 | 325 | \subsection{Arguments}{ 326 | \if{html}{\out{
}} 327 | \describe{ 328 | \item{\code{node}}{Character name of the node we are currently visiting.} 329 | } 330 | \if{html}{\out{
}} 331 | } 332 | } 333 | \if{html}{\out{
}} 334 | \if{html}{\out{}} 335 | \if{latex}{\out{\hypertarget{method-Tracker-calculate_ppr}{}}} 336 | \subsection{Method \code{calculate_ppr()}}{ 337 | Main driver function to perform the computations outlined in 338 | Algorithm 3. 339 | \subsection{Usage}{ 340 | \if{html}{\out{
}}\preformatted{Tracker$calculate_ppr()}\if{html}{\out{
}} 341 | } 342 | 343 | \subsection{Arguments}{ 344 | \if{html}{\out{
}} 345 | \describe{ 346 | \item{\code{node}}{Character name of the node we are currently visiting.} 347 | } 348 | \if{html}{\out{
}} 349 | } 350 | } 351 | \if{html}{\out{
}} 352 | \if{html}{\out{}} 353 | \if{latex}{\out{\hypertarget{method-Tracker-clone}{}}} 354 | \subsection{Method \code{clone()}}{ 355 | The objects of this class are cloneable with this method. 356 | \subsection{Usage}{ 357 | \if{html}{\out{
}}\preformatted{Tracker$clone(deep = FALSE)}\if{html}{\out{
}} 358 | } 359 | 360 | \subsection{Arguments}{ 361 | \if{html}{\out{
}} 362 | \describe{ 363 | \item{\code{deep}}{Whether to make a deep clone.} 364 | } 365 | \if{html}{\out{
}} 366 | } 367 | } 368 | } 369 | -------------------------------------------------------------------------------- /man/aPPR-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aPPR-package.R 3 | \docType{package} 4 | \name{aPPR-package} 5 | \alias{aPPR} 6 | \alias{aPPR-package} 7 | \title{aPPR: Approximate Personalized PageRank} 8 | \description{ 9 | Calculates approximate and regularized personalized PageRank vectors for massive graphs, including those that can only be queried via an API. Regularization allows discovery of community structure under some stochastic block models. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/RoheLab/aPPR} 15 | \item Report bugs at \url{https://github.com/RoheLab/aPPR/issues} 16 | } 17 | 18 | } 19 | \author{ 20 | \strong{Maintainer}: Alex Hayes \email{alexpghayes@gmail.com} (\href{https://orcid.org/0000-0002-4985-5160}{ORCID}) [copyright holder] 21 | 22 | Authors: 23 | \itemize{ 24 | \item Fan Chen \email{fan.chen@wisc.edu} (\href{https://orcid.org/0000-0003-4508-6023}{ORCID}) 25 | \item Karl Rohe 26 | } 27 | 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/abstract_graph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{abstract_graph} 4 | \alias{abstract_graph} 5 | \title{Create an abstract graph object} 6 | \usage{ 7 | abstract_graph(subclass, ...) 8 | } 9 | \arguments{ 10 | \item{subclass}{Desired subclass (character).} 11 | 12 | \item{...}{Other arguments to pass to \code{list()}. 
See 13 | \code{\link[=rtweet_graph]{rtweet_graph()}} for an example.} 14 | } 15 | \description{ 16 | Could be an actual graph object, or a graph such as the Twitter 17 | following network defined implicitly via API requests, etc. 18 | The abstract graph is just a list with \code{abstract_graph} class 19 | and your desired subclass. 20 | } 21 | -------------------------------------------------------------------------------- /man/appr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/aPPR.R, R/graph-igraph.R, R/graph-rtweet.R 3 | \name{appr} 4 | \alias{appr} 5 | \alias{appr.igraph} 6 | \alias{appr.rtweet_graph} 7 | \title{Approximate personalized pageranks} 8 | \usage{ 9 | appr( 10 | graph, 11 | seeds, 12 | ..., 13 | alpha = 0.15, 14 | epsilon = 1e-06, 15 | tau = NULL, 16 | max_visits = Inf 17 | ) 18 | 19 | \method{appr}{igraph}(graph, seeds, ...) 20 | 21 | \method{appr}{rtweet_graph}(graph, seeds, ...) 22 | } 23 | \arguments{ 24 | \item{graph}{An \code{\link[=abstract_graph]{abstract_graph()}} object, such as that created by 25 | \code{\link[=rtweet_graph]{rtweet_graph()}}. This argument is required.} 26 | 27 | \item{seeds}{A character vector of seeds for the personalized pagerank. 28 | The personalized pagerank will return to each of these seeds with 29 | probability \code{alpha} at each node transition. At the moment, 30 | all seeds are given equal weighting. This argument is required.} 31 | 32 | \item{...}{Ignored. Passing arguments to \code{...} results in a warning.} 33 | 34 | \item{alpha}{Teleportation constant. The teleportation constant is the 35 | probability of returning to a seed node at each node transition. 36 | \code{alpha} must be a valid probabilty; that is, between zero and one. 37 | Defaults to \code{0.15}. 
This is the complement of the "damping factor" 38 | in the original PageRank paper, so \code{alpha = 0.15} corresponds 39 | to a damping factor of \code{0.85}. Runtime is proportional to 40 | \code{1 / (epsilon * alpha)}, so small \code{alpha} can result in long 41 | runtimes.} 42 | 43 | \item{epsilon}{Desired accuracy of approximation. \code{epsilon} must be 44 | a small positive number. Defaults to \code{1e-6}. \code{aPPR} guarantees that 45 | approximated personalized pageranks are uniformly within \code{epsilon} of 46 | their true value. That is, the approximation is guaranteed to be good 47 | in an L-infinity sense. This does not guarantee, however, that 48 | a ranking of nodes by aPPR is close to a ranking of nodes by PPR. 49 | 50 | For Twitter graphs, we recommend testing your code with \code{1e-4} or \code{1e-5}, 51 | using \code{1e-6} for exploration, and \code{1e-7} to \code{1e-8} for final results, 52 | although these numbers are very rough. It is also perfectly reasonable 53 | to run \code{aPPR} for a given number of steps (set via \code{max_visits}), 54 | and then note the approximation accuracy of your results. Internally, 55 | \code{aPPR} keeps a running estimate of achieved accuracy that is always valid. 56 | 57 | Anytime you would like to explore more of the graph, you can simply 58 | decrease \code{epsilon}. So you can start with \code{epsilon = 1e-5} and then 59 | gradually decrease \code{epsilon} until you have a sample of the graph 60 | that you are happy with. 61 | 62 | Also note that runtime is proportional to \code{1 / (epsilon * alpha)}, 63 | so small \code{epsilon} can result in long runtimes.} 64 | 65 | \item{tau}{Regularization term. Additionally inflates the in-degree 66 | of each observation by this term by performing the degree 67 | adjustment described in Algorithm 3 and Algorithm 4, which 68 | are described in \code{vignette("Mathematical details")}. 
Defaults to 69 | \code{NULL}, in which case \code{tau} is set to the average in-degree of 70 | the observed nodes. In general, it's reasonable to 71 | set \code{tau} to the average in-degree of the graph.} 72 | 73 | \item{max_visits}{Maximum number of unique nodes to visit. Should be a 74 | positive integer. Defaults to \code{Inf}, such that there is no upper bound 75 | on the number of unique nodes to visit. Useful when you want to specify a 76 | fixed amount of computation (or API calls) to use rather than an 77 | error tolerance. We recommend debugging with \code{max_visits ~ 20}, 78 | exploration with \code{max_visits} in the hundreds, and \code{max_visits} in the 79 | thousands to tens of thousands for precise results, although this is a 80 | very rough heuristic.} 81 | } 82 | \value{ 83 | A \code{\link[=Tracker]{Tracker()}} object. Most relevant is the \code{stats} field, 84 | a \code{\link[tibble:tibble]{tibble::tibble()}} with the following columns: 85 | \itemize{ 86 | \item \code{name}: Name of a node (character). 87 | \item \code{p}: Current estimate of residual per out-degree for a node. 88 | \item \code{r}: Estimated error of pagerank estimate for a node. 89 | \item \code{in_degree}: Number of incoming edges to a node. 90 | \item \code{out_degree}: Number of outgoing edges from a node. 91 | \item \code{degree_adjusted}: The personalized pagerank divided by the 92 | node in-degree. 93 | \item \code{regularized}: The personalized pagerank divided by the node 94 | in-degree plus \code{tau}. 95 | } 96 | 97 | When computing personalized pageranks for Twitter users (either 98 | via \code{\link[=rtweet_graph]{rtweet_graph()}}), \code{name} is given 99 | as a user ID, not a screen name, regardless of how the seed nodes 100 | were specified. 101 | } 102 | \description{ 103 | Computes the personalized pagerank for specified seeds using the 104 | \code{ApproximatePageRank} algorithm of Andersen et al. (2006). 
Computes 105 | degree-adjustments and degree-regularization of personalized 106 | pagerank vectors as described in Algorithms 3 and 4 of Chen et al. (2019). 107 | These algorithms are randomized; if results are unstable across 108 | multiple runs, decrease \code{epsilon}. 109 | } 110 | \examples{ 111 | 112 | library(aPPR) 113 | library(igraph) 114 | 115 | set.seed(27) 116 | 117 | graph <- rtweet_graph() 118 | 119 | \dontrun{ 120 | appr(graph, "alexpghayes") 121 | } 122 | 123 | graph2 <- sample_pa(100) 124 | 125 | # this creates a Tracker object 126 | ppr_results <- appr(graph2, seeds = "5") 127 | 128 | # the portion of the Tracker object you probably care about 129 | ppr_results$stats 130 | 131 | } 132 | \references{ 133 | \enumerate{ 134 | \item Chen, Fan, Yini Zhang, and Karl Rohe. “Targeted Sampling from Massive Block Model Graphs with Personalized PageRank.” Journal of the Royal Statistical Society: Series B (Statistical Methodology) 82, no. 1 (February 2020): 99–126. https://doi.org/10.1111/rssb.12349. 135 | \item Andersen, Reid, Fan Chung, and Kevin Lang. “Local Graph Partitioning Using PageRank Vectors.” In 2006 47th Annual IEEE Symposium on Foundations of Computer Science (FOCS’06), 475–86. Berkeley, CA, USA: IEEE, 2006. https://doi.org/10.1109/FOCS.2006.44. 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /man/check.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{check} 4 | \alias{check} 5 | \title{Check if a node an abstract graph is acceptable for inclusion in PPR} 6 | \usage{ 7 | check(graph, nodes) 8 | } 9 | \arguments{ 10 | \item{graph}{A graph object.} 11 | 12 | \item{nodes}{The name(s) of node(s) in \code{graph} as a character vector.} 13 | } 14 | \value{ 15 | The subset of \code{nodes} that are acceptable for inclusion. 
This 16 | can be a character vector of length zero if necessary. It is critical 17 | that no entries of \code{nodes} are duplicated in this output, so we 18 | recommend calling \code{unique()} if there is any potential for repeats 19 | in your checking code. 20 | } 21 | \description{ 22 | Inclusion criteria: 23 | } 24 | \details{ 25 | \itemize{ 26 | \item At least one outgoing edge 27 | \item Can get in degree and out degree of node 28 | \item Can get all nodes connected to \code{node} / the 1-hop neighborhood 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /man/neighborhood.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{neighborhood} 4 | \alias{neighborhood} 5 | \title{Get the neighborhood of a node in a graph} 6 | \usage{ 7 | neighborhood(graph, node) 8 | } 9 | \arguments{ 10 | \item{graph}{A graph object.} 11 | 12 | \item{node}{The name of a single node in \code{graph} as a character vector.} 13 | } 14 | \value{ 15 | A character vector of all nodes in \code{graph} connected such that 16 | there is an outgoing edge from \code{node} to those nodes. This should 17 | never be empty, as \code{neighborhood()} should not be called on nodes 18 | that fail \code{check()}, and \code{check()} enforces that nodes have out-degree 19 | of at least one. It is critical that no node names are duplicated in the 20 | output; we recommend calling \code{unique()} if there is any potential 21 | for that to occur. 22 | } 23 | \description{ 24 | That is, find all nodes connected to \code{node} by an outgoing edge. 25 | This function is memoised to avoid making repeated API queries. 
26 | } 27 | -------------------------------------------------------------------------------- /man/node_degrees.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abstract-graph.R 3 | \name{node_degrees} 4 | \alias{node_degrees} 5 | \title{Get the in-degree and out-degree of nodes in an abstract graph} 6 | \usage{ 7 | node_degrees(graph, nodes) 8 | } 9 | \arguments{ 10 | \item{graph}{A graph object.} 11 | 12 | \item{nodes}{The name(s) of node(s) in \code{graph} as a character vector. 13 | Methods may assume that there are no repeated values in \code{nodes}.} 14 | } 15 | \value{ 16 | A \code{\link[=data.frame]{data.frame()}} with one row for every node in \code{nodes} and 17 | two columns: \code{in_degree} and \code{out_degree}. In a symmetric graph, 18 | \code{in_degree} and \code{out_degree} should match. 19 | } 20 | \description{ 21 | This function is only called on nodes that have been \code{\link[=check]{check()}}'d. It is 22 | safe to assume that \code{nodes} is non-empty. 23 | } 24 | -------------------------------------------------------------------------------- /man/rtweet_graph.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/graph-rtweet.R 3 | \name{rtweet_graph} 4 | \alias{rtweet_graph} 5 | \title{Create an abstract representation of the Twitter friendship graph} 6 | \usage{ 7 | rtweet_graph(retryonratelimit = TRUE, verbose = TRUE, n = 5000) 8 | } 9 | \arguments{ 10 | \item{retryonratelimit}{If \code{TRUE}, and a rate limit is exhausted, will wait 11 | until it refreshes. Most Twitter rate limits refresh every 15 minutes. 12 | If \code{FALSE}, and the rate limit is exceeded, the function will terminate 13 | early with a warning; you'll still get back all results received up to 14 | that point. 
The default value, \code{NULL}, consults the option 15 | \code{rtweet.retryonratelimit} so that you can globally set it to \code{TRUE}, 16 | if desired. 17 | 18 | If you expect a query to take hours or days to perform, you should not 19 | rely solely on \code{retryonratelimit} because it does not handle other common 20 | failure modes like temporarily losing your internet connection.} 21 | 22 | \item{verbose}{Show progress bars and other messages indicating current 23 | progress?} 24 | 25 | \item{n}{Desired number of results to return. Results are downloaded 26 | in pages when \code{n} is large; the default value will download a single 27 | page. Set \code{n = Inf} to download as many results as possible. 28 | 29 | The Twitter API rate limits the number of requests you can perform 30 | in each 15 minute period. The easiest way to download more than that is 31 | to use \code{retryonratelimit = TRUE}. 32 | 33 | You are not guaranteed to get exactly \code{n} results back. You will get 34 | fewer results when tweets have been deleted or if you hit a rate limit. 35 | You will get more results if you ask for a number of tweets that's not 36 | a multiple of page size, e.g. if you request \code{n = 150} and the page 37 | size is 200, you'll get 200 results back.} 38 | } 39 | \description{ 40 | Signifies that \code{aPPR} should query the Twitter friendship graph via 41 | \code{rtweet}. 42 | } 43 | -------------------------------------------------------------------------------- /man/update.Tracker.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/update.R 3 | \name{update.Tracker} 4 | \alias{update.Tracker} 5 | \title{Update a Tracker object} 6 | \usage{ 7 | \method{update}{Tracker}(object, ..., epsilon, max_visits) 8 | } 9 | \arguments{ 10 | \item{object}{The \code{Tracker} object to update.} 11 | 12 | \item{...}{Ignored. 
Passing arguments to \code{...} results in a warning.} 13 | 14 | \item{epsilon}{Desired accuracy of approximation. \code{epsilon} must be 15 | a small positive number. Defaults to \code{1e-6}. \code{aPPR} guarantees that 16 | approximated personalized pageranks are uniformly within \code{epsilon} of 17 | their true value. That is, the approximation is guaranteed to be good 18 | in an L-infinity sense. This does not guarantee, however, that 19 | a ranking of nodes by aPPR is close to a ranking of nodes by PPR. 20 | 21 | For Twitter graphs, we recommend testing your code with \code{1e-4} or \code{1e-5}, 22 | using \code{1e-6} for exploration, and \code{1e-7} to \code{1e-8} for final results, 23 | although these numbers are very rough. It also perfectly reasonable 24 | to run \code{aPPR} for a given number of steps (set via \code{max_visits}), 25 | and then note the approximation accuracy of your results. Internally, 26 | \code{aPPR} keeps a running estimate of achieved accuracy that is always valid. 27 | 28 | Anytime you would like to explore more of the graph, you can simply 29 | decrease \code{epsilon}. So you can start with \code{epsilon = 1e-5} and then 30 | gradually decrease \code{epsilon} until you have a sample of the graph 31 | that you are happy with. 32 | 33 | Also note that runtime is proportional to \code{1 / (epsilon * alpha)}, 34 | so small \code{epsilon} can result in long runtimes.} 35 | 36 | \item{max_visits}{Maximum number of unique nodes to visit. Should be a 37 | positive integer. Defaults to \code{Inf}, such that there is no upper bound 38 | on the number of unique nodes to visit. Useful when you want to specify a 39 | fixed amount of computation (or API calls) to use rather than an 40 | error tolerance. 
We recommend debugging with \code{max_visits ~ 20}, 41 | exploration with \code{max_visits} in the hundreds, and \code{max_visits} in the 42 | thousands to ten of thousands for precise results, although this is a 43 | very rough heuristic.} 44 | } 45 | \value{ 46 | A new \code{Tracker} object with a new value of \code{epsilon}. 47 | } 48 | \description{ 49 | Typically because results are insufficiently precise. 50 | } 51 | \details{ 52 | At the moment, only supports changing \code{epsilon}. If there is interest, 53 | we can consider allowing updates to \code{tau}, \code{alpha} and \code{seeds} in the 54 | future. 55 | } 56 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(aPPR) 3 | 4 | test_check("aPPR") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-matches-igraph.R: -------------------------------------------------------------------------------- 1 | skip_if_not_installed("igraph") 2 | library(igraph) 3 | 4 | library(logger) 5 | 6 | log_threshold(WARN, namespace = "aPPR") 7 | 8 | prefer <- function(node, total_nodes = 100) { 9 | alpha <- numeric(total_nodes) 10 | alpha[node] <- 1 11 | alpha 12 | } 13 | 14 | test_that("matches igraph calculations on connected graph", { 15 | 16 | # graph without sink nodes (i.e. every node has an outgoing edge) 17 | g3 <- make_ring(10) 18 | 19 | # make every node a seed node to recover page rank 20 | appr_ppr <- appr(g3, seeds = as.character(1:10)) 21 | 22 | # close enough but currently failing 23 | expect_equal(sum(appr_ppr$stats$p), 1, tolerance = 1e-4) 24 | 25 | appr_ppr2 <- appr(g3, seeds = "1") 26 | 27 | igraph_ppr <- page_rank(g3, personalized = prefer(1, 10))$vector 28 | 29 | # tolerance off by an order of magnitude again? 
30 | expect_equal(sort(appr_ppr2$stats$p), sort(igraph_ppr), tolerance = 1e-4) 31 | }) 32 | 33 | # did this ever work? i don't think it should 34 | # 35 | # test_that("matches igraph calculations on graph with sink nodes", { 36 | # 37 | # set.seed(26) 38 | # 39 | # ig <- sample_pa(100) 40 | # 41 | # # make every node a seed node to recover page rank 42 | # appr_ppr <- appr(ig, seeds = as.character(2:10)) 43 | # 44 | # # close enough but currently failing 45 | # expect_equal(sum(appr_ppr$stats$p), 1, tolerance = 1e-5) 46 | # 47 | # appr_ppr2 <- appr(ig, seeds = "1") 48 | # 49 | # igraph_ppr <- page_rank(ig, personalized = prefer(1, 10))$vector 50 | # 51 | # # tolerance off by an order of magnitude again? 52 | # expect_equal(sort(appr_ppr2$stats$p), sort(igraph_ppr), tolerance = 1e-5) 53 | # }) 54 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/extending-appr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Extending aPPR" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{Extending aPPR} 6 | %\VignetteEngine{knitr::rmarkdown} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>" 14 | ) 15 | ``` 16 | 17 | Suppose you want to calculate personalized PageRanks for some graph that is not supported by `aPPR`. You can extend `aPPR` to arbitrary graph objects, so long the graph object is an S3 object with methods: 18 | 19 | - `check()` 20 | - `node_degrees()` 21 | - `neighborhood()` 22 | - `appr()` (optional) 23 | 24 | See the documentation for those S3 generics to understand the generic specification fully! 
25 | 26 | We demonstrate how to implement these methods below for a new, custom graph object. In this case, we will consider the PubMed citation network, which we will interact via the PubMed API, using the `rentrez` package. First, we define a constructor function that returns a custom S3 graph object that subclasses `abstract_graph`. You can install rentrez with `pak::pak("ropensci/rentrez")`, and do not need to set up any authentication to begin using the API. 27 | 28 | ```{r} 29 | library(aPPR) 30 | library(logger) 31 | library(glue) 32 | library(rentrez) 33 | 34 | # constructor for PubMed graph object (defined over API) 35 | pubmed_graph <- function(max_attempts = 3) { 36 | if (!requireNamespace("rentrez", quietly = TRUE)) { 37 | stop( 38 | "`rentrez` package must be installed to use `pubmed_graph()`", 39 | call. = FALSE 40 | ) 41 | } 42 | 43 | agraph <- abstract_graph( 44 | subclass = "pubmed", 45 | max_attempts = max_attempts 46 | ) 47 | 48 | agraph 49 | } 50 | 51 | graph <- pubmed_graph() 52 | graph 53 | ``` 54 | 55 | Now we want to implement S3 methods for the `pubmed` object. In some cases, you can query data from a graph in large batches, but with the PubMed API it simpler (at least to my limited knowledge) to query node by node, with no bulk lookups. It turns out that we can get neighborhoods and node in-degree and node out-degree all at once, using `rentrez::entrez_link()`. We give the function three attempts (by default) to successfully complete this API call, since APIs sometimes fail. Then, since we will need this information repeatedly, we memoize the function, to avoid repeated calls to the API. 
56 | 57 | ```{r} 58 | # one node at a time 59 | get_pubmed_data <- function(graph, node) { 60 | for (i in 1:graph$max_attempts) { 61 | log_trace( 62 | glue("Attempt {i}/{graph$max_attempts} to get node degrees: {node}") 63 | ) 64 | 65 | tryCatch( 66 | { 67 | cites <- entrez_link(dbfrom = "pubmed", db = "all", id = node) 68 | break 69 | }, 70 | error = function(cnd) { 71 | if (i == graph$max_attempts) { 72 | log_debug( 73 | glue("Maximum attempts to find neighborhood met, could not find: {node}") 74 | ) 75 | stop("Couldn't pull data for node") 76 | } 77 | } 78 | ) 79 | } 80 | 81 | data <- list( 82 | refs = unique(cites$links$pubmed_pubmed_refs), 83 | citedby = unique(cites$links$pubmed_pubmed_citedin) 84 | ) 85 | 86 | data$num_refs <- length(data$refs) 87 | data$num_citedby <- length(data$citedby) 88 | data 89 | } 90 | 91 | memo_get_pubmed_data <- memoise::memoise(get_pubmed_data) 92 | ``` 93 | 94 | Now we test the function. I'm not currently sure that it's working: it's suspicious for two papers to have in-degree and out-degree all equal to 18 -- we need to dig into this and find out if the API is limited to returning a maximum of API results in a single call, for example. 95 | 96 | ```{r} 97 | good_node_ids <- c("30345262", "29624432", "29867837") 98 | bad_node_id <- "I am a pumpkin" 99 | mixed_node_ids <- c(good_node_ids, bad_node_id) 100 | 101 | # this is suspicious to me, something seems wrong here 102 | memo_get_pubmed_data(graph, good_node_ids[1]) 103 | 104 | # suspicious that the number of in-cites and out-cites matches, and that it 105 | # matches across both papers! TODO: investigate! 
memo_get_pubmed_data(graph, good_node_ids[2])
memo_get_pubmed_data(graph, good_node_ids[3])

# check that we handle bad node ids in some reliable way, in this case
# it looks like we get empty results
memo_get_pubmed_data(graph, bad_node_id)
```

```{r}
#' Check method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Can be empty!**
#'
#' @return A **character** vector of node ids that we can reach in the graph.
#'   For example, some node ids may not be reachable due to API failures,
#'   or, more generally, permissions failures.
#'
#'   If `nodes` is the empty vector, returns the empty vector. Be sure to
#'   handle this edge case.
check.pubmed <- function(graph, nodes) {
  log_debug(glue("Checking nodes"))

  # handle the case where no nodes are passed
  if (length(nodes) < 1) {
    return(character(0))
  }

  good_nodes <- character(0)

  for (node in nodes) {
    # a node whose data we cannot pull at all (exhausted API retries) is
    # unreachable: per the contract above it must be dropped, not allowed
    # to abort the whole check with an error
    node_data <- tryCatch(
      memo_get_pubmed_data(graph, node),
      error = function(cnd) NULL
    )

    if (is.null(node_data)) {
      log_trace(glue("Checked node: {node} (bad)"))
      next
    }

    # this is a sufficient check to see if (1) the node is in pubmed, (2)
    # we can pull its neighborhood, and (3) it has at least one
    # incoming or outgoing citation
    if (node_data$num_refs + node_data$num_citedby > 0) {
      log_trace(glue("Checked node: {node} (good)"))
      good_nodes <- c(good_nodes, node)
      next
    }

    log_trace(glue("Checked node: {node} (bad)"))
  }

  good_nodes
}
```

Now we test our implementation. To do this, we should give at least one good node id, and at least one bad node id. Only the good node id should be returned.

```{r}
check(graph, good_node_ids)
check(graph, bad_node_id)
check(graph, mixed_node_ids)
```

```{r}
#' Degree method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Cannot be empty.** Should
#'   not contain duplicates if `check()` is properly implemented and does
#'   not output duplicates.
#'
#' @return A list, with two elements, `in_degree` and `out_degree`. Both
#'   should be the same length as `nodes`, and match the order of `nodes`.
#'
node_degrees.pubmed <- function(graph, nodes) {
  log_debug(glue("Getting node degrees"))

  # preallocate both degree vectors to match the order of `nodes`
  degrees <- list(
    in_degree = integer(length(nodes)),
    out_degree = integer(length(nodes))
  )

  for (i in seq_along(nodes)) {
    log_debug(glue("Getting node degrees for node: {nodes[i]}"))
    node_data <- memo_get_pubmed_data(graph, nodes[i])

    # must treat pubmed like an undirected graph: a citation network is a
    # directed acyclic graph (papers only cite the past), so no pair of
    # nodes is mutually reachable and directed pagerank is not defined

    degrees$in_degree[i] <- node_data$num_citedby + node_data$num_refs
    degrees$out_degree[i] <- node_data$num_citedby + node_data$num_refs

    log_trace(glue("In-degree for node {nodes[i]}: {degrees$in_degree[i]}"))
    log_trace(glue("Out-degree for node {nodes[i]}: {degrees$out_degree[i]}"))
  }

  log_debug(glue("Done getting node degrees"))

  degrees
}
```

To test this method, we should pass a character vector of several good node ids.

```{r}
# test with a single node
node_degrees(graph, good_node_ids[1])

# test with multiple nodes! this is the key one!
this is suspicious, and
# means we need to check if our function memo_get_pubmed_data() is working
node_degrees(graph, good_node_ids)
```

```{r}
#' Neighborhood method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param node A length one character vector, for a node in the graph with
#'   at least one outgoing edge.
#'
#' @return A **character** vector of node ids for the graph neighborhood.
#'   Should be a vector of length at least one (if the `check()` method was
#'   implemented correctly), and should not contain duplicates.
neighborhood.pubmed <- function(graph, node) {
  # `!= 1` also rejects length-zero input, which `> 1` let through
  if (length(node) != 1) {
    stop("`node` must be a character vector of length one.")
  }

  log_debug(glue("Getting neighborhood: {node}"))
  node_data <- memo_get_pubmed_data(graph, node)
  log_debug(glue("Done getting neighborhood: {node}"))

  # the undirected neighborhood is references plus citing papers. NOTE:
  # `unique()` takes a single vector; its second positional argument is
  # `incomparables`, so `unique(node_data$refs, node_data$citedby)` would
  # silently drop all the `citedby` ids. Concatenate first, then dedupe.
  unique(c(node_data$refs, node_data$citedby))
}
```

```{r}
neighborhood(graph, good_node_ids[1])
neighborhood(graph, good_node_ids[2])
neighborhood(graph, good_node_ids[3])
```

Lastly, you can optionally implement an `appr` method for your abstract graph subclass. In the `appr` method for the subclass, you can do things like:

- Add functionality to convert a convenient seed node name (in this case, possibly something like a DOI) into the internal node name representation (see `appr.rtweet_graph()` for an example of this)
- Checks that you have appropriate authorization to pull information about the seed nodes
- Etc, etc

This custom subclass method will run before the general `appr.abstract_graph()`. We don't have a particular need to do anything like that here, so we do not.

### Debugging

If you are accessing a graph over an API, it's likely that you will encounter edge cases where the API returns no data, or data in a format that you did not expect. We highly recommend using logging to debug your implementation when this happens, using the `logger` library. See the `logger` documentation for details.

Find any errors, fix them, and rinse and repeat until you've completed the likely unpleasant task of tracking down all the edge cases in the API. In our case, we don't seem to find any edge cases right away.

```{r}
library(logger)

# set logging threshold for code you just wrote, if desired
log_threshold(TRACE)

# set logging threshold for aPPR package functions, if desired
log_threshold(DEBUG, namespace = "aPPR")

appr(
  graph, # the graph to work with
  seeds = good_node_ids[1], # name of seed node (character)
  epsilon = 0.0005, # desired approximation quality
  max_visits = 10 # bound computation since this is an example
)
```
--------------------------------------------------------------------------------