├── LICENSE
├── data
    ├── cascades.RData
    ├── policies.RData
    ├── validation.RData
    └── sim_validation.RData
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test_plot.R
    │   ├── test_netinf.R
    │   ├── test_simulation.R
    │   └── test_cascade.R
├── src
    ├── vuong_test.h
    ├── vuong_test.cpp
    ├── NetworkInference_init.c
    ├── distributions.h
    ├── distributions.cpp
    ├── possible_edges.h
    ├── netinf_utilities.h
    ├── RcppExports.cpp
    ├── netinf_utilities.cpp
    ├── possible_edges.cpp
    ├── spanning_tree.h
    ├── netinf.h
    ├── spanning_tree.cpp
    └── netinf.cpp
├── readme_figures
    ├── README-unnamed-chunk-5-1.png
    ├── README-unnamed-chunk-5-2.png
    ├── README-unnamed-chunk-9-1.png
    ├── README-unnamed-chunk-9-2.png
    └── README-unnamed-chunk-9-3.png
├── .travis.yml
├── .Rbuildignore
├── cran-comments.md
├── R
    ├── zzz.R
    ├── RcppExports.R
    ├── NetworkInference.R
    ├── summary.R
    ├── data.R
    ├── plot.R
    ├── netinf.R
    ├── simulate_cascades.R
    └── cascade.R
├── man
    ├── is.cascade.Rd
    ├── is.diffnet.Rd
    ├── count_possible_edges.Rd
    ├── sim_validation.Rd
    ├── simulate_rnd_cascades.Rd
    ├── subset_cascade.Rd
    ├── drop_nodes.Rd
    ├── cascades.Rd
    ├── subset_cascade_time.Rd
    ├── as.matrix.cascade.Rd
    ├── validation.Rd
    ├── as.data.frame.cascade.Rd
    ├── plot.diffnet.Rd
    ├── summary.cascade.Rd
    ├── plot.cascade.Rd
    ├── NetworkInference.Rd
    ├── as_cascade_wide.Rd
    ├── simulate_cascades.Rd
    ├── as_cascade_long.Rd
    ├── policies.Rd
    └── netinf.Rd
├── release_checklist.md
├── NAMESPACE
├── vignettes
    ├── bibliography.bib
    ├── quickstart_vignette.Rmd
    └── tutorial_vignette.Rmd
├── .gitignore
├── DESCRIPTION
├── netinf_benchmark.R
├── inst
    └── CITATION
├── README.Rmd
├── README.md
└── NEWS.md


/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2017
2 | COPYRIGHT HOLDER: Fridolin Linder


--------------------------------------------------------------------------------
/data/cascades.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/cascades.RData


--------------------------------------------------------------------------------
/data/policies.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/policies.RData


--------------------------------------------------------------------------------
/data/validation.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/validation.RData


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(NetworkInference)
3 | 
4 | test_check("NetworkInference")
5 | 


--------------------------------------------------------------------------------
/src/vuong_test.h:
--------------------------------------------------------------------------------
1 | using namespace Rcpp;
2 | double vuong_test(NumericVector x1, NumericVector x2, bool bic=false);


--------------------------------------------------------------------------------
/data/sim_validation.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/sim_validation.RData


--------------------------------------------------------------------------------
/readme_figures/README-unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/readme_figures/README-unnamed-chunk-5-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-5-2.png


--------------------------------------------------------------------------------
/readme_figures/README-unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/readme_figures/README-unnamed-chunk-9-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-9-2.png


--------------------------------------------------------------------------------
/readme_figures/README-unnamed-chunk-9-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-9-3.png


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Sample .travis.yml for R projects
2 | 
3 | language: r
4 | warnings_are_errors: true
5 | sudo: required
6 | branches:
7 |   only:
8 |     - master
9 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^CRAN-RELEASE$
 2 | ^.*\.Rproj$
 3 | ^\.Rproj\.user$
 4 | ^\.travis\.yml$
 5 | cran-comments.md
 6 | ^README\.Rmd$
 7 | readme_figures/
 8 | README.md
 9 | release_checklist.md
10 | ^revdep$
11 | netinf_benchmark.R
12 | vignettes/*_cache/
13 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | Version 1.2.5
 2 | 
 3 | Maintenance update: remove C++11 requirement per CRAN request; modernize CITATION file.
 4 | 
 5 | ## R CMD check results
 6 | 
 7 | * No errors, warnings, notes
 8 | 
 9 | ## Downstream dependencies
10 | There are no downstream dependencies
11 | 


--------------------------------------------------------------------------------
/R/zzz.R:
--------------------------------------------------------------------------------
 1 | .onAttach <- function(...) {
 2 |       packageStartupMessage(
 3 | 'NetworkInference: Inferring latent diffusion networks.
 4 | Version 1.2.4.9000 (Development Version)
 5 | copyright (c) 2016, Fridolin Linder, Pennsylvania State University
 6 |                     Bruce Desmarais, Pennsylvania State University
 7 | For citation information, type citation("NetworkInference").
 8 | Type help("NetworkInference") to get started.\n'
 9 |       )
10 |   }
11 | 


--------------------------------------------------------------------------------
/tests/testthat/test_plot.R:
--------------------------------------------------------------------------------
 1 | library(NetworkInference)
 2 | 
 3 | context("Test if plotting works.")
 4 | 
 5 | data(cascades)
 6 | 
 7 | test_that("Plotting function runs.", {
 8 |     p <- plot(cascades, selection = names(cascades$cascade_nodes)[1:3])
 9 |     p <- plot(cascades, label_nodes = FALSE)
10 |     res <- netinf(cascades, n_edges = 6, params = 1, quiet = TRUE)
11 |     #plot(res, "network")
12 |     p <- plot(res, "improvement")
13 |     expect_warning(plot(cascades))
14 | })


--------------------------------------------------------------------------------
/src/vuong_test.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include "distributions.h"
 3 | 
 4 | using namespace Rcpp;
 5 | 
 6 | double vuong_test(NumericVector x1, NumericVector x2, bool bic=false) {
 7 |     //x1: old (k edges), x2: new (k+1 edges)
 8 |     //x2 = x2 - (log(float(x2.size())) / (2 * float(x2.size())));
 9 |     x2 = x2 - (1 / float(x2.size()));
10 |     NumericVector liks = x2 - x1;
11 |     double sd = Rcpp::sd(liks);
12 |     double stat = sum(liks) / (sd * sqrt(float(x2.size())));
13 |     double pval = 1 - normal_cdf(stat);
14 |     return pval;
15 | }


--------------------------------------------------------------------------------
/man/is.cascade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{is.cascade}
 4 | \alias{is.cascade}
 5 | \title{Is the object of class cascade?}
 6 | \usage{
 7 | is.cascade(object)
 8 | }
 9 | \arguments{
10 | \item{object}{the object to be tested.}
11 | }
12 | \value{
13 | \code{TRUE} if object is a cascade, \code{FALSE} otherwise.
14 | }
15 | \description{
16 | Is the object of class cascade?
17 | }
18 | \examples{
19 | 
20 | data(cascades)
21 | is.cascade(cascades)
22 | # > TRUE
23 | is.cascade(1) 
24 | # > FALSE
25 | }
26 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | netinf_ <- function(cascade_nodes, cascade_times, n_edges, model, params, quiet, auto_edges, cutoff) {
 5 |     .Call(`_NetworkInference_netinf_`, cascade_nodes, cascade_times, n_edges, model, params, quiet, auto_edges, cutoff)
 6 | }
 7 | 
 8 | count_possible_edges_ <- function(cascade_nodes, cascade_times, quiet = TRUE) {
 9 |     .Call(`_NetworkInference_count_possible_edges_`, cascade_nodes, cascade_times, quiet)
10 | }
11 | 
12 | 


--------------------------------------------------------------------------------
/release_checklist.md:
--------------------------------------------------------------------------------
 1 | * Increment version number in DESCRIPTION
 2 | * Update version date in DESCRIPTION
 3 | * Increment version number in R/zzz.R
 4 | * Update NEWS.md
 5 | * Check on winbuilder (`devtools::build_win()`)
 6 | * Check locally (with valgrind) (`devtools:check(args = "--use-valgrind")`)
 7 | * Push to github and wait for Travis check
 8 | * Check reverse dependencies (`devtools::revdep_check()`)
 9 | * Update cran-comments.md
10 | * Release on cran
11 | * Wait until released
12 | * Create Github tag
13 | * Set version number to devel in DESCRIPTION
14 | * Set version number to devel in R/zzz.R
15 | * Update News with devel version 
16 | * Push to github


--------------------------------------------------------------------------------
/tests/testthat/test_netinf.R:
--------------------------------------------------------------------------------
 1 | library(NetworkInference)
 2 | 
 3 | context("Test if core netinf method works")
 4 | 
 5 | data(cascades)
 6 | data(validation)
 7 | test_that("netinf produces the edges as original netinf executable.", {
 8 |     from_netinf <- netinf(cascades, params = 1, trans_mod = "exponential",
 9 |                           n_edges = 5, quiet = TRUE)
10 |     t1 <- from_netinf[order(from_netinf[, 1], from_netinf[, 2]), c(-3, -4)]
11 |     rownames(t1) <- c(1:nrow(t1))
12 |     t2 <- validation[order(validation[, 1], validation[, 2]), -c(3:6)]
13 |     rownames(t2) <- c(1:nrow(t2))
14 |     class(t2) <- c("diffnet", "data.frame")
15 |     expect_equal(t1, t2)
16 | })


--------------------------------------------------------------------------------
/man/is.diffnet.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/netinf.R
 3 | \name{is.diffnet}
 4 | \alias{is.diffnet}
 5 | \title{Is the object of class diffnet?}
 6 | \usage{
 7 | is.diffnet(object)
 8 | }
 9 | \arguments{
10 | \item{object}{the object to be tested.}
11 | }
12 | \value{
13 | \code{TRUE} if object is a diffnet, \code{FALSE} otherwise.
14 | }
15 | \description{
16 | Tests if an object is of class diffnet. The class diffnet is appended to the 
17 | object returned by \code{\link{netinf}} for dispatch of appropriate plotting 
18 | methods.
19 | }
20 | \examples{
21 | 
22 | data(cascades)
23 | result <- netinf(cascades, n_edges = 6, params = 1)
24 | is.diffnet(result)
25 | }
26 | 


--------------------------------------------------------------------------------
/man/count_possible_edges.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/netinf.R
 3 | \name{count_possible_edges}
 4 | \alias{count_possible_edges}
 5 | \title{Count the number of possible edges in the dataset}
 6 | \usage{
 7 | count_possible_edges(cascades)
 8 | }
 9 | \arguments{
10 | \item{cascades}{Object of class cascade containing the data.}
11 | }
12 | \value{
13 | An integer count.
14 | }
15 | \description{
16 | Across all cascades, count the edges that are possible. An edge from node
17 | \code{u} to node \code{v}
18 | is only possible if in at least one cascade \code{u} experienced an event 
19 | before \code{v}.
20 | }
21 | \examples{
22 | data(cascades)
23 | count_possible_edges(cascades)
24 | 
25 | }
26 | 


--------------------------------------------------------------------------------
/man/sim_validation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{sim_validation}
 5 | \alias{sim_validation}
 6 | \title{Larger simulated validation network.}
 7 | \format{An object of class \code{data.frame} with 4 columns, containing:
 8 | \describe{
 9 |   \item{origin_node}{Origin of diffusion edge.}
10 |   \item{destination_node}{Destination node of diffusion edge.}
11 |   \item{improvement}{Improvement in score for the edge}
12 |   \item{p-value}{p-value for vuong test}
13 | }}
14 | \source{
15 | See code below.
16 | }
17 | \usage{
18 | data(sim_validation)
19 | }
20 | \description{
21 | A network from simulated data. For testing purposes.
22 | }
23 | \keyword{datasets}
24 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(as.data.frame,cascade)
 4 | S3method(as.matrix,cascade)
 5 | S3method(plot,cascade)
 6 | S3method(plot,diffnet)
 7 | S3method(summary,cascade)
 8 | export(as_cascade_long)
 9 | export(as_cascade_wide)
10 | export(count_possible_edges)
11 | export(drop_nodes)
12 | export(is.cascade)
13 | export(is.diffnet)
14 | export(netinf)
15 | export(simulate_cascades)
16 | export(simulate_rnd_cascades)
17 | export(subset_cascade)
18 | export(subset_cascade_time)
19 | import(assertthat)
20 | import(checkmate)
21 | import(ggplot2)
22 | import(ggrepel)
23 | importFrom(Rcpp,sourceCpp)
24 | importFrom(stats,density)
25 | importFrom(stats,runif)
26 | useDynLib(NetworkInference, .registration = TRUE)
27 | 


--------------------------------------------------------------------------------
/vignettes/bibliography.bib:
--------------------------------------------------------------------------------
 1 | @inproceedings{gomez2010inferring,
 2 |   title={Inferring networks of diffusion and influence},
 3 |   author={Gomez Rodriguez, Manuel and Leskovec, Jure and Krause, Andreas},
 4 |   booktitle={Proceedings of the 16th ACM SIGKDD international conference on Knowledge discovery and data mining},
 5 |   pages={1019--1028},
 6 |   year={2010},
 7 |   organization={ACM}
 8 | }
 9 | 
10 | @article{desmarais2015persistent,
11 |   title={Persistent Policy Pathways: Inferring Diffusion Networks in the American States},
12 |   author={Desmarais, Bruce A and Harden, Jeffrey J and Boehmke, Frederick J},
13 |   journal={American Political Science Review},
14 |   volume={109},
15 |   number={02},
16 |   pages={392--406},
17 |   year={2015},
18 |   publisher={Cambridge Univ Press}
19 | }
20 | 


--------------------------------------------------------------------------------
/man/simulate_rnd_cascades.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simulate_cascades.R
 3 | \name{simulate_rnd_cascades}
 4 | \alias{simulate_rnd_cascades}
 5 | \title{Simulate a set of random cascades}
 6 | \usage{
 7 | simulate_rnd_cascades(n_cascades, n_nodes)
 8 | }
 9 | \arguments{
10 | \item{n_cascades}{Number of cascades to generate.}
11 | 
12 | \item{n_nodes}{Number of nodes in the system.}
13 | }
14 | \value{
15 | A data frame containing (in order of columns) node ids, 
16 |     event time and cascade identifier.
17 | }
18 | \description{
19 | Simulate random cascades, for testing and demonstration purposes. No actual 
20 | diffusion model is underlying these cascades.
21 | }
22 | \examples{
23 | 
24 | df <- simulate_rnd_cascades(10, n_nodes = 20)
25 | head(df)
26 | 
27 | }
28 | 


--------------------------------------------------------------------------------
/src/NetworkInference_init.c:
--------------------------------------------------------------------------------
 1 | #include <R.h>
 2 | #include <Rinternals.h>
 3 | #include <stdlib.h> // for NULL
 4 | #include <R_ext/Rdynload.h>
 5 | 
 6 | /* .Call calls */
 7 | extern SEXP _NetworkInference_count_possible_edges_(SEXP, SEXP, SEXP);
 8 | extern SEXP _NetworkInference_netinf_(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP,
 9 |                                       SEXP);
10 | 
11 | static const R_CallMethodDef CallEntries[] = {
12 |     {"_NetworkInference_count_possible_edges_", (DL_FUNC) &_NetworkInference_count_possible_edges_, 3},
13 |     {"_NetworkInference_netinf_",               (DL_FUNC) &_NetworkInference_netinf_,               8},
14 |     {NULL, NULL, 0}
15 | };
16 | 
17 | void R_init_NetworkInference(DllInfo *dll)
18 | {
19 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
20 |     R_useDynamicSymbols(dll, FALSE);
21 | }
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | # Example code in package build process
 8 | *-Ex.R
 9 | # Output files from R CMD build
10 | /*.tar.gz
11 | # Output files from R CMD check
12 | /*.Rcheck/
13 | # RStudio files
14 | .Rproj.user/
15 | *.Rproj
16 | # produced vignettes
17 | vignettes/*.html
18 | vignettes/*.pdf
19 | vignettes/*_cache/
20 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
21 | .httr-oauth
22 | # knitr and R markdown default cache directories
23 | /*_cache/
24 | /cache/
25 | # Temporary files created by R markdown
26 | *.utf8.md
27 | *.knit.md
28 | .Rproj.user
29 | # Compiled files
30 | *.o
31 | *.so
32 | *.a
33 | # Development note
34 | devel_notes.md
35 | # Generated test data
36 | data/test_out.txt
37 | inst/doc
38 | README_old.md
39 | readme.html
40 | /revdep
41 | /revdep/.cache.rds
42 | *.swp
43 | 


--------------------------------------------------------------------------------
/man/subset_cascade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{subset_cascade}
 4 | \alias{subset_cascade}
 5 | \title{Select a subset of cascades from cascade object}
 6 | \usage{
 7 | subset_cascade(cascade, selection)
 8 | }
 9 | \arguments{
10 | \item{cascade}{cascade, object to select from}
11 | 
12 | \item{selection}{character or integer, vector of cascade_ids to select}
13 | }
14 | \value{
15 | An object of class cascade containing just the selected cascades
16 | }
17 | \description{
18 | Select a subset of cascades from cascade object
19 | }
20 | \examples{
21 | 
22 | data(policies)
23 | cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 
24 |                             event_time = 'adopt_year', cascade_id = 'policy')
25 | cascade_names <- names(cascades$cascade_times)
26 | subset_cascade(cascades, selection = cascade_names[1:10])
27 | 
28 | }
29 | 


--------------------------------------------------------------------------------
/man/drop_nodes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{drop_nodes}
 4 | \alias{drop_nodes}
 5 | \title{Drop nodes from a cascade object}
 6 | \usage{
 7 | drop_nodes(cascades, nodes, drop = TRUE)
 8 | }
 9 | \arguments{
10 | \item{cascades}{cascade, object to drop nodes from.}
11 | 
12 | \item{nodes}{character or integer, vector of node_ids to drop.}
13 | 
14 | \item{drop}{logical, Should empty cascades be dropped.}
15 | }
16 | \value{
17 | An object of class cascade containing the cascades without the 
18 |     dropped nodes.
19 | }
20 | \description{
21 | Drop nodes from a cascade object
22 | }
23 | \examples{
24 | 
25 | data(policies)
26 | cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 
27 |                             event_time = 'adopt_year', cascade_id = 'policy')
28 | new_cascades <- drop_nodes(cascades, c("California", "New York"))
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/cascades.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{cascades}
 5 | \alias{cascades}
 6 | \title{Example cascades}
 7 | \format{An object of class \code{cascade} containing 4 objects
 8 | \describe{
 9 |   \item{node_names}{Character node names}
10 |   \item{cascade_nodes}{A list of integer vectors. Each containing the names of the
11 |       nodes infected in this cascades in the order of infection}
12 |   \item{cascade_times}{A list of numeric vectors. Each containing the infection
13 |       times for the corresponding nodes in cascade_nodes}
14 | }}
15 | \source{
16 | \url{https://github.com/snap-stanford/snap/blob/master/examples/netinf/example-cascades.txt}
17 | }
18 | \usage{
19 | data(cascades)
20 | }
21 | \description{
22 | An example dataset of 31 nodes and 54 cascades. From the original netinf 
23 | implementation in SNAP.
24 | }
25 | \keyword{datasets}
26 | 


--------------------------------------------------------------------------------
/src/distributions.h:
--------------------------------------------------------------------------------
 1 | using namespace Rcpp;
 2 | 
 3 | /**
 4 |  * Exponential density
 5 |  * 
 6 |  * @param x Value to evaluate.
 7 |  * @param lambda Rate paramter of the distribution.
 8 |  * 
 9 |  * @return Density value
10 |  */
11 | double dexp_(double x, double lambda);
12 | 
13 | /**
14 |  * Rayleigh density
15 |  * 
16 |  * @param x Value to evaluate.
17 |  * @param lambda shape paramter of the distribution.
18 |  * 
19 |  * @return Density value
20 |  */
21 | double drayleigh_(double x, double lambda);
22 | 
23 | /**
24 |  * Log-normal density
25 |  * 
26 |  * @param x Value to evaluate.
27 |  * @param mu mean
28 |  * @param sigma variance
29 |  * 
30 |  * @return Density value
31 |  */
32 | double dlognormal_(double x, double mu, double sigma);
33 | 
34 | /**
35 |  * Cumulative distribution function of the standard normal distribution
36 |  * 
37 |  * @param x Value to evaluate.
38 |  * 
39 |  * @return Probability of X > x.
40 |  */
41 | double normal_cdf(double x);
42 | 


--------------------------------------------------------------------------------
/man/subset_cascade_time.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{subset_cascade_time}
 4 | \alias{subset_cascade_time}
 5 | \title{Subset a cascade object in time}
 6 | \usage{
 7 | subset_cascade_time(cascade, start_time, end_time, drop = TRUE)
 8 | }
 9 | \arguments{
10 | \item{cascade}{cascade, object to subset.}
11 | 
12 | \item{start_time}{numeric, start time of the subset.}
13 | 
14 | \item{end_time}{numeric, end time of the subset.}
15 | 
16 | \item{drop}{logical, should empty sub-cascades be dropped?}
17 | }
18 | \value{
19 | An object of class cascade, where only events are included that have 
20 |     times \code{start_time} <= t < \code{end_time}.
21 | }
22 | \description{
23 | Remove each all events occurring outside the desired subset for each cascade 
24 | in a cascade object.
25 | }
26 | \examples{
27 | 
28 | data(cascades)
29 | sub_cascades <- subset_cascade_time(cascades, 10, 20, drop=TRUE)
30 | 
31 | }
32 | 


--------------------------------------------------------------------------------
/man/as.matrix.cascade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{as.matrix.cascade}
 4 | \alias{as.matrix.cascade}
 5 | \title{Convert a cascade object to a matrix}
 6 | \usage{
 7 | \method{as.matrix}{cascade}(x, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{cascade object to convert.}
11 | 
12 | \item{...}{additional arguments to be passed to or from methods. 
13 | (Currently not supported.)}
14 | }
15 | \value{
16 | A matrix containing all cascade information in wide format. That is,
17 | each row of the matrix corresponds to a node and each column to a cascade. 
18 | Cell entries are event times. Censored nodes have \code{NA} for their entry.
19 | }
20 | \description{
21 | Generates a \code{\link{matrix}} containing the cascade information in the 
22 | cascade object in wide format. Missing values are used for nodes that do not 
23 | experience an event in a cascade.
24 | }
25 | \examples{
26 | 
27 | data(cascades) 
28 | as.matrix(cascades)
29 | 
30 | }
31 | 


--------------------------------------------------------------------------------
/man/validation.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{validation}
 5 | \alias{validation}
 6 | \title{Validation output from netinf source.}
 7 | \format{An object of class \code{data.frame} with 6 columns, containing:
 8 | \describe{
 9 |   \item{origin_node}{Origin of diffusion edge.}
10 |   \item{destination_node}{Destination node of diffusion edge.}
11 |   \item{volume}{??}
12 |   \item{marginal_gain}{Marginal gain from edge.}
13 |   \item{median_time_difference}{Median time between events in origin and 
14 |       destination}
15 |   \item{mean_time_difference}{Mean time between events in origin and 
16 |       destination}
17 | }}
18 | \source{
19 | Output from netinf example program (\url{https://github.com/snap-stanford/snap/tree/master/examples/netinf}).
20 | }
21 | \usage{
22 | data(validation)
23 | }
24 | \description{
25 | Contains output from original netinf C++ implementation, executed on 
26 | \code{\link{cascades}}. For testing purposes.
27 | }
28 | \keyword{datasets}
29 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: NetworkInference
 2 | Type: Package
 3 | Title: Inferring Latent Diffusion Networks
 4 | Version: 1.2.5
 5 | Date: 2025-11-28
 6 | Authors@R: c(person("Fridolin", "Linder", email = "fridolin.linder@gmail.com",
 7 |                   role = c("aut", "cre")),
 8 |              person("Bruce", "Desmarais", role = "ctb"))
 9 | Description: This is an R implementation of the netinf algorithm (Gomez Rodriguez, Leskovec, and Krause, 2010)<doi:10.1145/1835804.1835933>. Given a set of events that spread between a set of nodes the algorithm infers the most likely stable diffusion network that is underlying the diffusion process.
10 | License: MIT + file LICENSE
11 | Imports:
12 |     Rcpp (>= 0.12.5),
13 |     assertthat,
14 |     checkmate,
15 |     ggplot2,
16 |     ggrepel,
17 |     stats
18 | LinkingTo: 
19 |     Rcpp,
20 |     RcppProgress
21 | BugReports: https://github.com/desmarais-lab/NetworkInference/issues
22 | Suggests:
23 |     testthat,
24 |     knitr,
25 |     rmarkdown,
26 |     pander,
27 |     igraph,
28 |     utils,
29 |     dplyr
30 | RoxygenNote: 6.1.1
31 | LazyData: true
32 | VignetteBuilder: knitr
33 | 


--------------------------------------------------------------------------------
/man/as.data.frame.cascade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{as.data.frame.cascade}
 4 | \alias{as.data.frame.cascade}
 5 | \title{Convert a cascade object to a data frame}
 6 | \usage{
 7 | \method{as.data.frame}{cascade}(x, row.names = NULL, optional = FALSE,
 8 |   ...)
 9 | }
10 | \arguments{
11 | \item{x}{Cascade object to convert.}
12 | 
13 | \item{row.names}{NULL or a character vector giving the row names for the data 
14 | frame. Missing values are not allowed.}
15 | 
16 | \item{optional}{logical. If TRUE, setting row names and converting column 
17 | names (to syntactic names: see make.names) is optional. (Not supported)}
18 | 
19 | \item{...}{Additional arguments passed to \code{\link{data.frame}}.}
20 | }
21 | \value{
22 | A data frame with three columns. Containing 1) The names of 
23 |     the nodes (\code{"node_name"}) that experience an event in each cascade, 
24 |     2) the event time (\code{"event_time"}) of the corresponding node, 
25 |     3) the cascade identifier \code{"cascade_id"}.
26 | }
27 | \description{
28 | Generates a data frame containing the cascade information in the cascade object.
29 | }
30 | \examples{
31 | 
32 | data(cascades)
33 | as.data.frame(cascades)
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/man/plot.diffnet.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{plot.diffnet}
 4 | \alias{plot.diffnet}
 5 | \title{Visualize netinf output}
 6 | \usage{
 7 | \method{plot}{diffnet}(x, type = "network", ...)
 8 | }
 9 | \arguments{
10 | \item{x}{object of class diffnet to be plotted.}
11 | 
12 | \item{type}{character, one of \code{c("network", "improvement", "p-value")} 
13 | indicating if the inferred diffusion network, the 
14 | improvement for each edge or the p-value from the vuong test for each
15 | edge should be visualized .}
16 | 
17 | \item{...}{additional arguments.}
18 | }
19 | \value{
20 | A ggplot plot object if \code{type = "improvement"} otherwise an 
21 |     igraph plot.
22 | }
23 | \description{
24 | Visualize the inferred diffusion network or the marginal gain in fit obtained
25 | by addition of each edge.
26 | }
27 | \details{
28 | If `type = improvement` a ggplot object is returned. It can be modified like
29 | any other ggplot. See the ggplot documentation and the examples in 
30 | \link{plot.cascade}.
31 | }
32 | \examples{
33 | 
34 | \dontrun{
35 |  data(cascades)
36 |  res <- netinf(cascades, quiet = TRUE)
37 |  plot(res, type = "network")
38 |  plot(res, type = "improvement")
39 |  plot(res, type = "p-value")
40 | }
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/src/distributions.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include <math.h>
 3 | using namespace Rcpp;
 4 | 
 5 | double dexp_(double x, double lambda) {
 6 |     return lambda * std::exp(-1 * lambda * x);
 7 | }
 8 | 
 9 | double drayleigh_(double x, double lambda) {
10 |     return (x / pow(lambda, 2)) * std::exp(-pow(x, 2) / (2 * pow(lambda, 2)));
11 | }
12 | 
13 | double dlognormal_(double x, double mu, double sigma) {
14 |     if(x <= 0) {
15 |         std::string msg = "x outside support of log-normal distribution.\n";
16 |         throw std::invalid_argument(msg);
17 |     }
18 |     return 1 / (x*sigma*sqrt(2*M_PI)) * 
19 |         std::exp(-(pow((log(x) - mu), 2)/(2*pow(sigma, 2))));
20 | }
21 | 
22 | // https://www.johndcook.com/blog/cpp_phi/
23 | double normal_cdf(double x) {
24 |     // constants
25 |     double a1 =  0.254829592;
26 |     double a2 = -0.284496736;
27 |     double a3 =  1.421413741;
28 |     double a4 = -1.453152027;
29 |     double a5 =  1.061405429;
30 |     double p  =  0.3275911;
31 |     
32 |     // Save the sign of x
33 |     int sign = 1;
34 |     if (x < 0)
35 |     sign = -1;
36 |     x = fabs(x)/sqrt(2.0);
37 |     
38 |     // A&S formula 7.1.26
39 |     double t = 1.0/(1.0 + p*x);
40 |     double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x);
41 |     
42 |     return 0.5*(1.0 + sign*y);
43 | }
44 | 


--------------------------------------------------------------------------------
/netinf_benchmark.R:
--------------------------------------------------------------------------------
 1 | library(microbenchmark)
 2 | devtools::load_all()
 3 | 
 4 | cs = c(10, 50, 100, 500)
 5 | ns = c(5, 10, 50, 100)
 6 | params = data.frame(expand.grid(cs, ns), t = NA)
 7 | set.seed(123)
 8 | for(i in 1:nrow(params)) {
 9 |         df <- simulate_rnd_cascades(params[i, 1], n_nodes = params[i, 2])
10 |         cascades <- as_cascade_long(df, node_names = unique(df$node_name))
11 |         bm = microbenchmark('test' = netinf(cascades, trans_mod = "exponential", 
12 |                                             n_edges = 10, params = 1, 
13 |                                             quiet = T), times = 50)
14 |         netinf(cascades, max_iter = 1, params = 10)
15 |         t = mean(bm$time) * 1e-6
16 |         params[i, 3] = t
17 | }
18 | 
19 | # Current performance (4a506800389c2047c09eead95bc0b3224a2c0654)
20 | Var1 Var2            t
21 | 1    10    5    1.1869810
22 | 2    50    5    0.7875174
23 | 3   100    5    1.4549489
24 | 4   500    5    5.9649529
25 | 5    10   10    0.7451685
26 | 6    50   10    1.5072754
27 | 7   100   10    3.5749858
28 | 8   500   10   12.8299009
29 | 9    10   50    7.3565341
30 | 10   50   50   22.9769044
31 | 11  100   50   64.7670850
32 | 12  500   50  271.0306328
33 | 13   10  100   19.7277996
34 | 14   50  100  113.3796667
35 | 15  100  100  272.5178511
36 | 16  500  100 1712.7072782
37 | 


--------------------------------------------------------------------------------
/src/possible_edges.h:
--------------------------------------------------------------------------------
 1 | #include <array>
 2 | using namespace Rcpp;
 3 | 
 4 | typedef std::array<int, 2> id_array; 
 5 | typedef std::pair<std::vector<int>, double> edge_value;
 6 | typedef std::map<id_array, edge_value> edge_map;
 7 | 
 8 | /**
 9 |  * Finds all possible edges from a set of cascades
10 |  * 
11 |  * @param cascade_nodes A list of integer vectors containing the nodes involved 
12 |  *     in each cascade in order of infection
13 |  * @param cascade_times A list of numeric vectors containing the infection times
14 |  *     of each node in the corresponding vector in cascade_nodes
15 |  * @param quiet Should progress be reported
16 |  * 
17 |  * @return edge_map with all possible edges
18 |  */
19 | edge_map get_possible_edges_(List &cascade_nodes, List &cascade_times,
20 |                              bool &quiet);
21 | 
22 | /**
23 |  * Wrapper to count possible edges (for rcpp export)
24 |  * @param cascade_nodes A list of integer vectors containing the nodes involved 
25 |  *     in each cascade in order of infection
26 |  * @param cascade_times A list of numeric vectors containing the infection times
27 |  *     of each node in the corresponding vector in cascade_nodes
28 |  * @param quiet Should progress be reported
29 |  *     
30 |  * @return integer number of possible edges
31 |  */
32 | int count_possible_edges_(List &cascade_nodes, List &cascade_times, 
33 |                           bool &quiet);


--------------------------------------------------------------------------------
/src/netinf_utilities.h:
--------------------------------------------------------------------------------
 1 | #include <chrono>
 2 | using namespace Rcpp;
 3 | 
 4 | /**
 5 |  * Type definitions
 6 |  */
 7 | typedef std::chrono::system_clock Clock;
 8 | typedef std::chrono::system_clock::time_point time_point;
 9 | typedef std::chrono::duration<double, std::milli> time_duration;
10 | 
11 | 
12 | /**
13 |  * Find the position of an (first) integer in a vector
14 |  * 
15 |  * @param x Vector to search in.
16 |  * @param val Value to search for.
17 |  * 
18 |  * @return Integer index of location of val in x or -1 if val not in x.
19 |  */
20 | int get_index(IntegerVector x, int val);
21 | 
22 | /**
23 |  * Sum all elements of a vector, skipping na values
24 |  * 
25 |  * @param x Vector to sum over
26 |  */
27 | double sum_vector(NumericVector x);
28 | 
29 | /**
30 |  * Copy a numeric vector
31 |  */
32 | NumericVector copy_vector(NumericVector x);
33 | 
34 | /**
35 |  * Print the estimated estimation time in legible units
36 |  * 
37 |  * @param fp_ms duration of inference of a single edge in milliseconds.
38 |  * @param auto_edges Does the algorithm run in auto mode (no fixed number of 
39 |  *     edges).
40 |  * @param n_edges Number of edges to infer (only relevant if auto_edges=false).
41 |  */
42 | void print_time_estimate(std::chrono::duration<double, std::milli> fp_ms,
43 |                          bool auto_edges, int n_edges);
44 | 
45 | 
46 | time_point print_timing(time_point start_time, std::string step);


--------------------------------------------------------------------------------
/inst/CITATION:
--------------------------------------------------------------------------------
 1 | citHeader("To cite NetworkInference in publications use:")
 2 | 
 3 | bibentry(
 4 |   bibtype = "Misc",
 5 |   key     = "NetworkInference",
 6 |   title   = "{NetworkInference}: Inferring latent diffusion networks",
 7 |   author  = c(
 8 |     person("Fridolin", "Linder"),
 9 |     person("Bruce A.", "Desmarais")
10 |   ),
11 |   year    = "2017",
12 |   url     = "https://github.com/desmarais-lab/NetworkInference"
13 | )
14 | 
15 | bibentry(
16 |   bibtype = "Misc",
17 |   key     = "BoehmkeNSF",
18 |   title   = "Collaborative Research: An Expanded Framework for Inferring Public Policy Diffusion Networks.",
19 |   author  = c(
20 |     person("Frederick J.", "Boehmke"),
21 |     person("Bruce A.", "Desmarais"),
22 |     person("Jeffrey J.", "Harden"),
23 |     person("Hanna M.", "Wallach")
24 |   ),
25 |   journal = "National Science Foundation",
26 |   volume  = "SES Awards: 1558661, 1558781, 1558561, and 1558509"
27 | )
28 | 
29 | bibentry(
30 |   bibtype = "Article",
31 |   key     = "BoehmkeSPID",
32 |   title   = "A New Database for Inferring Public Policy Innovativeness and Diffusion Networks.",
33 |   author  = c(
34 |     person("Frederick J.", "Boehmke"),
35 |     person("Mark D.", "Brockway"),
36 |     person("Bruce A.", "Desmarais"),
37 |     person("Jeffrey J.", "Harden"),
38 |     person("Scott J.", "LaCombe"),
39 |     person("Fridolin", "Linder"),
40 |     person("Hanna M.", "Wallach")
41 |   ),
42 |   journal = "Presented at the 2017 Midwest Political Science Association Annual Conference",
43 |   year    = "2017"
44 | )
45 | 
46 | 


--------------------------------------------------------------------------------
/man/summary.cascade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/summary.R
 3 | \name{summary.cascade}
 4 | \alias{summary.cascade}
 5 | \title{Summarize a cascade object}
 6 | \usage{
 7 | \method{summary}{cascade}(object, quiet = FALSE, ...)
 8 | }
 9 | \arguments{
10 | \item{object}{object of class cascade to be summarized.}
11 | 
12 | \item{quiet}{logical, if \code{FALSE} summary stats are printed to std out.}
13 | 
14 | \item{...}{Additional arguments passed to summary.}
15 | }
16 | \value{
17 | Prints cascade summary information to the screen
18 |     (if \code{quiet = FALSE}). \code{'# cascades'} is the number of cascades in
19 |     the object, \code{'# nodes'} is the number of nodes in the system (nodes 
20 |     that can theoretically experience an event), \code{'# nodes in cascades'} is 
21 |     the number of unique nodes of the system that experienced an event and 
22 |     \code{'# possible edges'} is the number of edges that are possible given
23 |     the cascade data (see \code{\link{count_possible_edges}} for details.). 
24 |     
25 |     Additional summaries for each cascade are returned invisibly. 
26 |     cascade), \code{length} (length of the cascade as an integer of how many
27 |     nodes experienced and event) and \code{n_ties} (number of tied event 
28 |     times per cascade).
29 | }
30 | \description{
31 | Generates summary statistics for single cascades and across cascades in a 
32 | collection, contained in a cascades object.
33 | }
34 | \examples{
35 | data(cascades)
36 | summary(cascades)
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/man/plot.cascade.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{plot.cascade}
 4 | \alias{plot.cascade}
 5 | \title{Plot a cascade object}
 6 | \usage{
 7 | \method{plot}{cascade}(x, label_nodes = TRUE, selection = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{object of class cascade to be plotted.}
11 | 
12 | \item{label_nodes}{logical, indicating if should the nodes in each cascade be 
13 | labeled. If the cascades are very dense setting this to \code{FALSE} is
14 | recommended.}
15 | 
16 | \item{selection}{a vector of cascade ids to plot.}
17 | 
18 | \item{...}{additional arguments passed to plot.}
19 | }
20 | \value{
21 | A ggplot plot object.
22 | }
23 | \description{
24 | Allows plotting of one or multiple, labeled or unlabeled cascades.
25 | }
26 | \details{
27 | The function returns a ggplot plot object (class \code{gg, ggplot}) which 
28 | can be modified like any other ggplot. See the ggplot documentation and the 
29 | examples below for more details.
30 | }
31 | \examples{
32 | 
33 | data(cascades)
34 | plot(cascades, selection = names(cascades$cascade_nodes)[1:5])
35 | plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20))
36 | 
37 | # Modify resulting ggplot object
38 | library(ggplot2) 
39 | p <- plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20))
40 | ## Add a title
41 | p <- p + ggtitle('Your Title')
42 | p
43 | ## Change Axis
44 | p <- p + xlab("Your modified y axis label") #x and y labels are flipped here
45 | p <- p + ylab("Your modified x axis label") #x and y labels are flipped here
46 | p
47 | 
48 | }
49 | 


--------------------------------------------------------------------------------
/tests/testthat/test_simulation.R:
--------------------------------------------------------------------------------
 1 | #library(NetworkInference)
 2 | #
 3 | #context("Test if simulation methods works")
 4 | #
 5 | #if (requireNamespace("igraph", quietly = TRUE)) {
 6 | #    data(cascades)
 7 | #    test_that("Simulation function works.", {
 8 | #        from_netinf <- netinf(cascades, params = 1, trans_mod = "exponential",
 9 | #                              n_edges = 5, quiet = TRUE)
10 | #        set.seed(123)
11 | #        out <- simulate_cascades(from_netinf, nsim = 100, max_time = 10)
12 | #        casc <- as_cascade_long(out)
13 | #        rec <- netinf(casc, params = 1, trans_mod = "exponential", n_edges = 5,
14 | #                      quiet = TRUE)
15 | #        rec <- rec[order(as.numeric(rec[, 1])), ]
16 | #        rec <- rec[order(as.numeric(rec[, 2])), ]
17 | #        from_netinf <- from_netinf[order(as.numeric(from_netinf[, 1])), ]
18 | #        from_netinf <- from_netinf[order(as.numeric(from_netinf[, 2])), ]
19 | #        rownames(rec) <- rownames(from_netinf) <- NULL
20 | #        expect_equal(from_netinf[, c(-3, -4)], rec[, c(-3, -4)])
21 | #    })
22 | #    test_that("Simulation function with partial cascade works.", {
23 | #        partial_cascade <- cascades
24 | #        partial_cascade$cascade_nodes <- cascades$cascade_nodes[10]
25 | #        partial_cascade$cascade_times <- cascades$cascade_times[10]
26 | #        from_netinf <- netinf(cascades, params = 1, trans_mod = "exponential",
27 | #                              n_edges = 100, quiet = TRUE)   
28 | #        out <- simulate_cascades(from_netinf, nsim = 100, max_time = 10, 
29 | #                                 partial_cascade = partial_cascade)
30 | #        expect_equal(length(unique(out[out$node_name == "22", 2])), 1)
31 | #        expect_equal(length(unique(out[out$node_name == "28", 2])), 1)
32 | #    })
33 | #}
34 | 


--------------------------------------------------------------------------------
/R/NetworkInference.R:
--------------------------------------------------------------------------------
 1 | #' NetworkInference: Inferring latent diffusion networks
 2 | #' 
 3 | #' This package provides an R implementation of the \code{netinf} algorithm 
 4 | #' created by Gomez Rodriguez, Leskovec, and  Krause (2010). Given a set of
 5 | #' events that spread between a set of nodes the algorithm infers the most likely
 6 | #' stable diffusion network that is underlying the diffusion process.
 7 | #' 
 8 | #' The package provides three groups of functions: 1) data preparation 
 9 | #' 2) estimation and 3) interpretation.
10 | #' 
11 | #' @section Data preparation:
12 | #' 
13 | #' The core estimation function \code{\link{netinf}} requires an object of class 
14 | #' \code{cascade} (see \link{as_cascade_long} and \link{as_cascade_wide}). 
15 | #' Cascade data contains information on the potential nodes in the network as 
16 | #' well as on event times for each node in each cascade. 
17 | #' 
18 | #' @section Estimation:
19 | #' 
20 | #' Diffusion networks are estimated using the \code{\link{netinf}} function. It 
21 | #' produces a diffusion network in form of an edgelist (of class 
22 | #' \code{\link{data.frame}}).
23 | #' 
24 | #' @section Interpretation and Visualization:
25 | #' 
26 | #' Cascade data can be visualized with the \code{plot} method of the \code{cascade}
27 | #' class (\code{diffnet, \link{plot.cascade}}). Results of the estimation process can 
28 | #' be visualized using the plotting method of the \code{diffnet} class. 
29 | #' 
30 | #' @section Performance:
31 | #' 
32 | #' If higher performance is required and for very large data sets, a faster pure C++ 
33 | #' implementation is available in the Stanford Network Analysis Project (SNAP). 
34 | #' The software can be downloaded at \url{http://snap.stanford.edu/netinf/}.
35 | #' 
36 | #' @useDynLib NetworkInference, .registration = TRUE
37 | #' @importFrom Rcpp sourceCpp
38 | #' @name NetworkInference
39 | NULL


--------------------------------------------------------------------------------
/man/NetworkInference.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/NetworkInference.R
 3 | \name{NetworkInference}
 4 | \alias{NetworkInference}
 5 | \title{NetworkInference: Inferring latent diffusion networks}
 6 | \description{
 7 | This package provides an R implementation of the \code{netinf} algorithm 
 8 | created by Gomez Rodriguez, Leskovec, and  Krause (2010). Given a set of
 9 | events that spread between a set of nodes the algorithm infers the most likely
10 | stable diffusion network that is underlying the diffusion process.
11 | }
12 | \details{
13 | The package provides three groups of functions: 1) data preparation 
14 | 2) estimation and 3) interpretation.
15 | }
16 | \section{Data preparation}{
17 | 
18 | 
19 | The core estimation function \code{\link{netinf}} requires an object of class 
20 | \code{cascade} (see \link{as_cascade_long} and \link{as_cascade_wide}). 
21 | Cascade data contains information on the potential nodes in the network as 
22 | well as on event times for each node in each cascade.
23 | }
24 | 
25 | \section{Estimation}{
26 | 
27 | 
28 | Diffusion networks are estimated using the \code{\link{netinf}} function. It 
29 | produces a diffusion network in form of an edgelist (of class 
30 | \code{\link{data.frame}}).
31 | }
32 | 
33 | \section{Interpretation and Visualization}{
34 | 
35 | 
36 | Cascade data can be visualized with the \code{plot} method of the \code{cascade}
37 | class (\code{diffnet, \link{plot.cascade}}). Results of the estimation process can 
38 | be visualized using the plotting method of the \code{diffnet} class.
39 | }
40 | 
41 | \section{Performance}{
42 | 
43 | 
44 | If higher performance is required and for very large data sets, a faster pure C++ 
45 | implementation is available in the Stanford Network Analysis Project (SNAP). 
46 | The software can be downloaded at \url{http://snap.stanford.edu/netinf/}.
47 | }
48 | 
49 | 


--------------------------------------------------------------------------------
/vignettes/quickstart_vignette.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "NetworkInference: Quick Start Guide"
 3 | author: "Fridolin Linder"
 4 | date: "`r Sys.Date()`"
 5 | output: rmarkdown::html_vignette
 6 | vignette: >
 7 |   %\VignetteIndexEntry{NetworkInference: Quick Start Guide}
 8 |   %\VignetteEngine{knitr::rmarkdown}
 9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 | 
12 | ---
13 | 
14 | # Introduction
15 | 
16 | ---
17 | 
18 | This package provides an R implementation of the netinf algorithm 
19 | created by @gomez2010inferring (see [here](http://snap.stanford.edu/netinf/) 
20 | for more information and the original C++ implementation). Given a set of events 
21 | that spread between a set of nodes the algorithm infers the most likely stable 
22 | diffusion network that is underlying the diffusion process.
23 | 
24 | ---
25 | 
26 | # Installation
27 | 
28 | ---
29 | 
30 | The package can be installed from [CRAN](https://CRAN.R-project.org/):
31 | ```{r, eval=FALSE}
32 | install.packages("NetworkInference")
33 | ```
34 | 
35 | The latest development version can be installed from 
36 | [github](https://github.com/desmarais-lab/NetworkInference):
37 | ```{r, eval=FALSE}
38 | #install.packages(devtools)
39 | devtools::install_github('desmarais-lab/NetworkInference')
40 | ```
41 | 
42 | ---
43 | 
44 | # Quick start guide
45 | 
46 | ---
47 | 
48 | To get started, get your data into the `cascades` format required by the `netinf` 
49 | function:
50 | 
51 | ```{r, results='hide', message=FALSE}
52 | library(NetworkInference)
53 | 
54 | # Simulate random cascade data
55 | df <- simulate_rnd_cascades(50, n_node = 20)
56 | 
57 | # Cast data into `cascades` object
58 | ## From long format
59 | cascades <- as_cascade_long(df)
60 | 
61 | ## From wide format
62 | df_matrix <- as.matrix(cascades) ### Create example matrix
63 | cascades <- as_cascade_wide(df_matrix)
64 | ```
65 | 
66 | Then fit the model:
67 | ```{r}
68 | result <- netinf(cascades, quiet = TRUE, p_value_cutoff = 0.05)
69 | ```
70 | 
71 | ```{r, eval=FALSE}
72 | head(result)
73 | ```
74 | ```{r, results="asis", echo=FALSE}
75 | pander::pandoc.table(head(result))
76 | ```
77 | 
78 | 


--------------------------------------------------------------------------------
/src/RcppExports.cpp:
--------------------------------------------------------------------------------
 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand
 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 3 | 
 4 | #include <Rcpp.h>
 5 | 
 6 | using namespace Rcpp;
 7 | 
 8 | // netinf_
 9 | List netinf_(List& cascade_nodes, List& cascade_times, int& n_edges, std::string& model, NumericVector& params, bool quiet, bool& auto_edges, double& cutoff);
10 | RcppExport SEXP _NetworkInference_netinf_(SEXP cascade_nodesSEXP, SEXP cascade_timesSEXP, SEXP n_edgesSEXP, SEXP modelSEXP, SEXP paramsSEXP, SEXP quietSEXP, SEXP auto_edgesSEXP, SEXP cutoffSEXP) {
11 | BEGIN_RCPP
12 |     Rcpp::RObject rcpp_result_gen;
13 |     Rcpp::RNGScope rcpp_rngScope_gen;
14 |     Rcpp::traits::input_parameter< List& >::type cascade_nodes(cascade_nodesSEXP);
15 |     Rcpp::traits::input_parameter< List& >::type cascade_times(cascade_timesSEXP);
16 |     Rcpp::traits::input_parameter< int& >::type n_edges(n_edgesSEXP);
17 |     Rcpp::traits::input_parameter< std::string& >::type model(modelSEXP);
18 |     Rcpp::traits::input_parameter< NumericVector& >::type params(paramsSEXP);
19 |     Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP);
20 |     Rcpp::traits::input_parameter< bool& >::type auto_edges(auto_edgesSEXP);
21 |     Rcpp::traits::input_parameter< double& >::type cutoff(cutoffSEXP);
22 |     rcpp_result_gen = Rcpp::wrap(netinf_(cascade_nodes, cascade_times, n_edges, model, params, quiet, auto_edges, cutoff));
23 |     return rcpp_result_gen;
24 | END_RCPP
25 | }
26 | // count_possible_edges_
27 | int count_possible_edges_(List& cascade_nodes, List& cascade_times, bool quiet);
28 | RcppExport SEXP _NetworkInference_count_possible_edges_(SEXP cascade_nodesSEXP, SEXP cascade_timesSEXP, SEXP quietSEXP) {
29 | BEGIN_RCPP
30 |     Rcpp::RObject rcpp_result_gen;
31 |     Rcpp::RNGScope rcpp_rngScope_gen;
32 |     Rcpp::traits::input_parameter< List& >::type cascade_nodes(cascade_nodesSEXP);
33 |     Rcpp::traits::input_parameter< List& >::type cascade_times(cascade_timesSEXP);
34 |     Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP);
35 |     rcpp_result_gen = Rcpp::wrap(count_possible_edges_(cascade_nodes, cascade_times, quiet));
36 |     return rcpp_result_gen;
37 | END_RCPP
38 | }
39 | 


--------------------------------------------------------------------------------
/man/as_cascade_wide.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{as_cascade_wide}
 4 | \alias{as_cascade_wide}
 5 | \title{Transform wide data to cascade}
 6 | \usage{
 7 | as_cascade_wide(data, node_names = NULL)
 8 | }
 9 | \arguments{
10 | \item{data}{\link{data.frame} or \link{matrix}, rows corresponding to nodes, 
11 | columns to cascades. Matrix entries are the event times for each node, 
12 | cascade pair. Missing values indicate censored observations, that is, 
13 | nodes that did not have an event). Specify column and row names if 
14 | cascade and node ids other than integer sequences are  desired. Note that, 
15 | if the time column is of class date or any other special time class, it 
16 | will be converted to an integer with `as.numeric()`.}
17 | 
18 | \item{node_names}{character, factor or numeric vector, containing names for each node. 
19 | Optional. If not provided, node names are inferred from the provided data.}
20 | }
21 | \value{
22 | An object of class \code{cascade}. This is a list containing three
23 |     (named) elements: 
24 |     \enumerate{
25 |         \item \code{"node_names"} A character vector of node names.
26 |         \item \code{"cascade_nodes"} A list with one character vector per
27 |             cascade containing the node names in order of the events.
28 |         \item \code{"cascade_times"} A list with one element per cascade 
29 |             containing the event times for the nodes in \code{"cascade_names"}.
30 |     }
31 | }
32 | \description{
33 | Create a cascade object from data in wide format.
34 | }
35 | \details{
36 | If data is in wide format, each row corresponds to a node and each column to
37 | a cascade. Each cell indicates the event time for a node - cascade 
38 | combination. If a node did not experience an event for a cascade (the node
39 | is censored) the cell entry must be \code{NA}.
40 | }
41 | \examples{
42 | 
43 | data("policies")
44 | cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 
45 |                             event_time = 'adopt_year', cascade_id = 'policy')
46 | wide_policies = as.matrix(cascades)
47 | cascades <- as_cascade_wide(wide_policies)
48 | is.cascade(cascades)
49 | 
50 | }
51 | 


--------------------------------------------------------------------------------
/src/netinf_utilities.cpp:
--------------------------------------------------------------------------------
 1 | #include <Rcpp.h>
 2 | #include <cmath>
 3 | #include <string>
 4 | #include <math.h>
 5 | #include <numeric>
 6 | #include <chrono>
 7 | #include "netinf_utilities.h"
 8 | 
 9 | using namespace Rcpp;
10 | 
11 | 
12 | NumericVector init_parameters(NumericVector &cascade_times, std::string model) {
13 |     NumericVector out;
14 |     return out;
15 | }
16 | 
17 | 
18 | int get_index(IntegerVector x, int val) {
19 |     for(int i = 0; i < x.size(); i++) {
20 |         if(x[i] == val) return i;
21 |     }
22 |     return -1;
23 | }
24 | 
25 | double sum_vector(NumericVector x) {
26 |     double out = 0;
27 |     for(int i = 0; i < x.size(); i++)  {
28 |         if(std::isnan(x[i])) continue;
29 |         out += x[i];
30 |     }
31 |     return out;
32 | }
33 | 
34 | NumericVector copy_vector(NumericVector x) {
35 |     NumericVector out(x.size());
36 |     for(int i = 0; i < x.size(); i++) out[i] = x[i];
37 |     return out;
38 | }
39 | 
40 | void print_time_estimate(std::chrono::duration<double, std::milli> fp_ms,
41 |                          bool auto_edges, int n_edges) {
42 |     float estimate;
43 |     std::string message;
44 |     if(auto_edges) {
45 |         estimate = fp_ms.count();
46 |         message = "Estimated time per edge: ";
47 |     } else {
48 |         estimate = fp_ms.count() * n_edges;
49 |         message = "Estimated completion time: ";
50 |     }
51 |     std::string unit = "milliseconds";
52 |     if ((estimate > 1000) & (estimate < 60000)) {
53 |         estimate /= 1000;  
54 |         unit = "seconds";
55 |     } else if ((estimate > 60000) & (estimate < 3600000)) {
56 |         estimate /= 60000;
57 |         unit = "minutes";
58 |     } else if ((estimate > 3600000) & (estimate < 86400000)) {
59 |         estimate /= 3600000;
60 |         unit = "hours";
61 |     } else if (estimate > 86400000) {
62 |         estimate /= 86400000;
63 |         unit = "days";
64 |     }
65 |     float out = roundf(estimate * 100) / 100;
66 |     Rcout << message << out << " " << unit << ".\n";
67 | }
68 | 
69 | time_point print_timing(time_point start_time, std::string step) {
70 |     time_point t2 = Clock::now();
71 |     time_duration fp_ms = t2 - start_time;
72 |     Rcout << step << "took: " << fp_ms.count() << "ms\n";
73 |     return t2;
74 | }


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: github_document
 3 | ---
 4 | 
 5 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 6 | 
 7 | ```{r, echo = FALSE}
 8 | knitr::opts_chunk$set(
 9 |   collapse = TRUE,
10 |   comment = "#>",
11 |   fig.path = "readme_figures/README-"
12 | )
13 | ```
14 | 
15 | # NetworkInference: Inferring Latent Diffusion Networks
16 | 
17 | ![](https://travis-ci.org/desmarais-lab/NetworkInference.svg) ![](http://www.r-pkg.org/badges/version/NetworkInference)
18 | ![](http://cranlogs.r-pkg.org/badges/NetworkInference)
19 | ![](http://cranlogs.r-pkg.org/badges/grand-total/NetworkInference?color=yellow)
20 | 
21 | ## About
22 | 
23 | This package provides an R implementation of the netinf algorithm 
24 | created by Gomez-Rodriguez, Leskovec, and Krause (see 
25 | [here](http://snap.stanford.edu/netinf/) for more information and the original 
26 | C++ implementation). Given a set of events that spread between a set of nodes 
27 | the algorithm infers the most likely stable diffusion network that is underlying 
28 | the diffusion process.
29 | 
30 | ## Installation
31 | 
32 | The package can be installed from [CRAN](https://CRAN.R-project.org/):
33 | ```{r, eval=FALSE}
34 | install.packages("NetworkInference")
35 | ```
36 | 
37 | The latest development version can be installed from 
38 | [github](https://github.com/desmarais-lab/NetworkInference):
39 | ```{r, eval=FALSE}
40 | #install.packages(devtools)
41 | devtools::install_github('desmarais-lab/NetworkInference')
42 | ```
43 | 
44 | ## Quick start guide
45 | 
46 | To get started, get your data into the `cascades` format required by the `netinf` 
47 | function:
48 | 
49 | ```{r, results='hide', message=FALSE}
50 | library(NetworkInference)
51 | 
52 | # Simulate random cascade data
53 | df <- simulate_rnd_cascades(50, n_node = 20)
54 | 
55 | # Cast data into `cascades` object
56 | ## From long format
57 | cascades <- as_cascade_long(df)
58 | 
59 | ## From wide format
60 | df_matrix <- as.matrix(cascades) ### Create example matrix
61 | cascades <- as_cascade_wide(df_matrix)
62 | ```
63 | 
64 | Then fit the model:
65 | ```{r}
66 | result <- netinf(cascades, quiet = TRUE, p_value_cutoff = 0.05)
67 | ```
68 | 
69 | ```{r, eval=FALSE}
70 | head(result)
71 | ```
72 | ```{r, results="asis", echo=FALSE}
73 | pander::pandoc.table(head(result))
74 | ```


--------------------------------------------------------------------------------
/tests/testthat/test_cascade.R:
--------------------------------------------------------------------------------
 1 | library(NetworkInference)
 2 | 
 3 | set.seed(2552)
 4 | context("Test if cascade data structure and related methods work.")
 5 | 
 6 | n_casc <- 100
 7 | n_nodes <- 100
 8 | 
 9 | # Long format
10 | test_that("Long format", {
11 |     dat <- simulate_rnd_cascades(n_casc, n_nodes)
12 |     casc <- as_cascade_long(data = dat, cascade_node_name = "node_name", 
13 |                             event_time = "event_time", 
14 |                             cascade_id = "cascade_id")
15 |     df <- as.data.frame(casc)
16 |     df <- df[order(df$cascade_id, df$node_name), ]
17 |     dat$cascade_id <- as.character(dat$cascade_id)
18 |     dat <- dat[order(dat$cascade_id, dat$node_name), ]
19 |     rownames(dat) <- c(1:nrow(dat))
20 |     rownames(df) <- c(1:nrow(df))
21 |     expect_equal(dat, df)    
22 | })
23 | 
24 | # Wide format
25 | test_that("Wide format", {
26 |     dat <- simulate_rnd_cascades(n_casc, n_nodes)
27 |     casc <- as_cascade_long(data = dat, cascade_node_name = "node_name", 
28 |                             event_time = "event_time", 
29 |                             cascade_id = "cascade_id")
30 |     m <- as.matrix(casc)
31 |     cascade <- as_cascade_wide(m)
32 |     expect_equal(casc, cascade)
33 | })
34 | 
35 | # Subsetting
36 | test_that("Subsetting works", {
37 |     data("cascades") 
38 |     reduced_cascade <- subset_cascade_time(cascades, 10, 20, drop=TRUE)
39 |     expect_equal(length(reduced_cascade), length(cascades))
40 |     expect_equal(length(reduced_cascade$cascade_nodes), length(reduced_cascade$cascade_times))
41 |     expect_equal(min(do.call(c, reduced_cascade$cascade_times)), 10.1, tolerance = 0.1)
42 |     expect_equal(max(do.call(c, reduced_cascade$cascade_times)), 19.8, tolerance = 0.1)
43 |     reduced_cascade <- subset_cascade_time(cascades, 10, 20, drop=FALSE)
44 |     expect_equal(length(reduced_cascade), length(cascades))
45 |     expect_equal(length(reduced_cascade$cascade_nodes), length(reduced_cascade$cascade_times))
46 |     expect_equal(length(reduced_cascade$cascade_nodes), length(cascades$cascade_nodes))
47 |     expect_equal(length(reduced_cascade$cascade_times), length(cascades$cascade_times))
48 |     expect_equal(min(do.call(c, reduced_cascade$cascade_times)), 10.1, tolerance = 0.1)
49 |     expect_equal(max(do.call(c, reduced_cascade$cascade_times)), 19.8, tolerance = 0.1)
50 |     expect_warning(as.data.frame(reduced_cascade))
51 | })
52 | 


--------------------------------------------------------------------------------
/man/simulate_cascades.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/simulate_cascades.R
 3 | \name{simulate_cascades}
 4 | \alias{simulate_cascades}
 5 | \title{Simulate cascades from a diffusion network}
 6 | \usage{
 7 | simulate_cascades(diffnet, nsim = 1, max_time = Inf,
 8 |   start_probabilities = NULL, partial_cascade = NULL, params = NULL,
 9 |   model = NULL, nodes = NULL)
10 | }
11 | \arguments{
12 | \item{diffnet}{object of class \code{diffnet}.}
13 | 
14 | \item{nsim}{integer, number of cascades to simulate.}
15 | 
16 | \item{max_time}{numeric, the maximum time after which observations are 
17 | censored}
18 | 
19 | \item{start_probabilities}{a vector of probabilities for each node in diffnet,
20 | to be the node with the first event. If \code{NULL} a node is drawn from
21 | a uniform distribution over all nodes.}
22 | 
23 | \item{partial_cascade}{object of type cascade, containing one partial 
24 | cascades for which further development should be simulated.}
25 | 
26 | \item{params}{numeric, (optional) parameters for diffusion time distribution. 
27 | See the details section of \code{\link{netinf}} for specification details.
28 | Only use this argument if parameters different from those contained in the 
29 | \code{diffnet} object should be used or the network is not an object of 
30 | class \code{diffnet}.}
31 | 
32 | \item{model}{character, diffusion model to use. One of \code{c("exponential", 
33 | "rayleigh", "log-normal")}. Only use this argument if parameters different 
34 | from those contained in the \code{diffnet} object should be used or the 
35 | network is not an object of class \code{diffnet}.}
36 | 
37 | \item{nodes}{vector of node ids if different from nodes included in 
38 | \code{diffnet}}
39 | }
40 | \value{
41 | A data frame with three columns. Containing 1) The names of 
42 |     the nodes (\code{"node_name"}) that experience an event in each cascade, 
43 |     2) the event time (\code{"event_time"}) of the corresponding node, 
44 |     3) the cascade identifier \code{"cascade_id"}.
45 | }
46 | \description{
47 | Simulate diffusion cascades based on the generative model underlying netinf
48 | and a diffusion network.
49 | }
50 | \examples{
51 | 
52 | data(cascades) 
53 | out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1)
54 | simulated_cascades <- simulate_cascades(out, nsim = 10)
55 |  
56 | # Simulation from partial cascade
57 | 
58 | }
59 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | <!-- README.md is generated from README.Rmd. Please edit that file -->
 3 | 
 4 | # NetworkInference: Inferring Latent Diffusion Networks
 5 | 
 6 | ![](https://travis-ci.org/desmarais-lab/NetworkInference.svg)
 7 | ![](http://www.r-pkg.org/badges/version/NetworkInference)
 8 | ![](http://cranlogs.r-pkg.org/badges/NetworkInference)
 9 | ![](http://cranlogs.r-pkg.org/badges/grand-total/NetworkInference?color=yellow)
10 | 
11 | ## About
12 | 
13 | This package provides an R implementation of the netinf algorithm
14 | created by Gomez-Rodriguez, Leskovec, and Krause (see
15 | [here](http://snap.stanford.edu/netinf/) for more information and the
16 | original C++ implementation). Given a set of events that spread between
17 | a set of nodes the algorithm infers the most likely stable diffusion
18 | network that is underlying the diffusion process.
19 | 
20 | ## Installation
21 | 
22 | The package can be installed from [CRAN](https://CRAN.R-project.org/):
23 | 
24 | ``` r
25 | install.packages("NetworkInference")
26 | ```
27 | 
28 | The latest development version can be installed from
29 | [github](https://github.com/desmarais-lab/NetworkInference):
30 | 
31 | ``` r
32 | #install.packages(devtools)
33 | devtools::install_github('desmarais-lab/NetworkInference')
34 | ```
35 | 
36 | ## Quick start guide
37 | 
38 | To get started, get your data into the `cascades` format required by the
39 | `netinf` function:
40 | 
41 | ``` r
42 | library(NetworkInference)
43 | 
44 | # Simulate random cascade data
45 | df <- simulate_rnd_cascades(50, n_node = 20)
46 | 
47 | # Cast data into `cascades` object
48 | ## From long format
49 | cascades <- as_cascade_long(df)
50 | 
51 | ## From wide format
52 | df_matrix <- as.matrix(cascades) ### Create example matrix
53 | cascades <- as_cascade_wide(df_matrix)
54 | ```
55 | 
56 | Then fit the model:
57 | 
58 | ``` r
59 | result <- netinf(cascades, quiet = TRUE, p_value_cutoff = 0.05)
60 | ```
61 | 
62 | ``` r
63 | head(result)
64 | ```
65 | 
66 | | origin\_node | destination\_node | improvement | p\_value  |
67 | | :----------: | :---------------: | :---------: | :-------: |
68 | |      20      |         7         |    290.1    | 7.324e-06 |
69 | |      8       |        17         |     272     | 1.875e-05 |
70 | |      3       |         2         |    270.5    | 1.87e-05  |
71 | |      20      |         5         |    262.8    | 1.899e-05 |
72 | |      7       |        16         |    250.4    | 4.779e-05 |
73 | |      20      |        15         |     249     | 4.774e-05 |
74 | 


--------------------------------------------------------------------------------
/src/possible_edges.cpp:
--------------------------------------------------------------------------------
 1 | // [[Rcpp::depends(RcppProgress)]]
 2 | #include <progress.hpp>
 3 | #include <progress_bar.hpp>
 4 | #include <Rcpp.h>
 5 | #include <array>
 6 | #include "possible_edges.h"
 7 | 
 8 | using namespace Rcpp;
 9 | 
10 | edge_map get_possible_edges_(List &cascade_nodes, List &cascade_times, 
11 |                              bool& quiet) {
12 |     edge_map possible_edges;
13 |     int n_cascades = cascade_nodes.size();
14 |     if(!quiet) Rcout << "Getting possible edges...\n";
15 |     Progress p(n_cascades, !quiet);
16 |     for(int c = 0; c < n_cascades; c++) {
17 |         checkUserInterrupt();
18 |         IntegerVector this_cascade_nodes = cascade_nodes[c];
19 |         NumericVector this_cascade_times = cascade_times[c];
20 |         int csize = this_cascade_nodes.size();
21 |         
22 |         // Use the fact that the cascade data is ordered (see cascade.R)
23 |         for(int i = 0; i < csize; i++) {
24 |             int parent = this_cascade_nodes[i];
25 |             double t_parent = this_cascade_times[i];
26 |             for(int j = i + 1; j < csize; j++) {
27 |                 int child = this_cascade_nodes[j];
28 |                 double t_child = this_cascade_times[j];
29 |                 
30 |                 // If times are tied skip this combination
31 |                 if(t_parent >= t_child) {
32 |                     continue;
33 |                 }
34 |                 
35 |                 // Check if pair is in pair collection. If not include
36 |                 std::array<int, 2> pair_id = {{child, parent}};
37 |                 
38 |                 auto it = possible_edges.find(pair_id);
39 |                 if(it == possible_edges.end()) {
40 |                     std::vector<int> possible_cascades;
41 |                     possible_cascades.push_back(c);
42 |                     double improvement = -1;
43 |                     edge_value value = make_pair(possible_cascades, 
44 |                                                  improvement);
45 |                     possible_edges.insert(make_pair(pair_id, value));
46 |                 } else {
47 |                     it->second.first.push_back(c);
48 |                 }
49 |             }
50 |         }
51 |         p.increment();
52 |     }
53 |     return possible_edges;
54 | }
55 | 
56 | //[[Rcpp::export]]
57 | int count_possible_edges_(List &cascade_nodes, List &cascade_times, 
58 |                           bool quiet=true) {
59 |     edge_map edges = get_possible_edges_(cascade_nodes, cascade_times, quiet);
60 |     return edges.size();
61 | }
62 | 


--------------------------------------------------------------------------------
/src/spanning_tree.h:
--------------------------------------------------------------------------------
 1 | using namespace Rcpp;
 2 | 
 3 | /**
 4 |  * Calculate the weighted log likelihood of an edge (i -> j) in a tree.
 5 |  * 
 6 |  * @param event_time_i Time node i experienced the event
 7 |  * @param event_time_j Time node j experienced the event
 8 |  * @param model The diffusion model.
 9 |  * @param params Parameters of the density funtion of the diffusion 
10 |  *     model.
11 |  * @param tied Is the edge tied in the tree/cascade or is it out of network 
12 |  *     diffusion
13 |  *     
14 |  * @return Weighted log-likelihood score of the edge
15 |  */
16 | double edge_score(double &event_time_i, double &event_time_j, std::string &model,
17 |                   NumericVector &params, bool tied);
18 | 
19 | /**
20 |  * Generate the optimal spanning tree for a cascade.
21 |  * 
22 |  * @param cascade_nodes Integer vector of node ids in the order in which they 
23 |  *     experienced the event.
24 |  * @param cascade_times The event times for the corresponding nodes in 
25 |  *     cascade_nodes.
26 |  * @param model The diffusion model.
27 |  * @param params Parameters of the density funtion of the diffusion 
28 |  *     model.
29 |  * 
30 |  * @return A list containing two vectors and a scalar: 
31 |  *    [0] Integer vector of parent ids. Each id is the parent of the node in the 
32 |  *    corresponding position of the input `cascade_nodes`
33 |  *    [1] Numeric vector of likelihood scores. Each score is the score for the 
34 |  *    edge form the node in [0] to the node in the respective position in input 
35 |  *    `cascade_nodes`
36 |  *    [2] The total score of the tree (sum of all edge scores)
37 |  */
38 | List optimal_spanning_tree(IntegerVector &cascade_nodes, 
39 |                             NumericVector &cascade_times, std::string &model, 
40 |                             NumericVector &params);
41 | 
42 | /**
43 |  * Construct the optimal spanning tree for all cascades
44 |  * 
45 |  * @param cascade_nodes List of integer vectors of node ids in the order in 
46 |  *     which they experienced the event in the respective cascade
47 |  * @param cascade_times List of numeric vectors containing the event times 
48 |  *     for the corresponding nodes in cascade_nodes.
49 |  * @param params Parameters of the density funtion of the diffusion 
50 |  *     model.
51 |  * @param model The diffusion model.
52 |  * 
53 |  * @returns A list containing the optimal spanning tree for each cascade (see
54 |  *     optimal spanning tree for the data format of each tree)
55 |  */
56 | List initialize_trees(List &cascade_nodes, List &cascade_times, 
57 |                       NumericVector &params, std::string &model);


--------------------------------------------------------------------------------
/man/as_cascade_long.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/cascade.R
 3 | \name{as_cascade_long}
 4 | \alias{as_cascade_long}
 5 | \title{Transform long data to cascade}
 6 | \usage{
 7 | as_cascade_long(data, cascade_node_name = "node_name",
 8 |   event_time = "event_time", cascade_id = "cascade_id",
 9 |   node_names = NULL)
10 | }
11 | \arguments{
12 | \item{data}{\link{data.frame}, containing the cascade data 
13 | with column names corresponding to the arguments provided to 
14 | \code{cascade_node_names}, \code{event_time} and \code{cascade_id}.}
15 | 
16 | \item{cascade_node_name}{character, column name of \code{data} that specifies 
17 | the node names in the cascade.}
18 | 
19 | \item{event_time}{character, column name of \code{data} that specifies the 
20 | event times for each node involved in a cascade.}
21 | 
22 | \item{cascade_id}{character, column name of the cascade identifier.}
23 | 
24 | \item{node_names}{character, factor or numeric vector containing the names for each node. 
25 | Optional. If not provided, node names are inferred from the cascade data.}
26 | }
27 | \value{
28 | An object of class \code{cascade}. This is a list containing three
29 |     (named) elements: 
30 |     \enumerate{
31 |         \item \code{"node_names"} A character vector of node names.
32 |         \item \code{"cascade_nodes"} A list with one character vector per
33 |             cascade containing the node names in order of the events.
34 |         \item \code{"cascade_times"} A list with one element per cascade 
35 |             containing the event times for the nodes in \code{"cascade_names"}.
36 |     }
37 | }
38 | \description{
39 | Create a cascade object from data in long format.
40 | }
41 | \details{
42 | Each row of the data describes one event in the cascade. The data must 
43 | contain at least three columns:
44 | \enumerate{
45 |    \item Cascade node name: The identifier of the node that experiences the 
46 |        event.
47 |    \item Event time: The time when the node experiences the event. Note that
48 |        if the time column is of class date or any other special time class, 
49 |        it will be converted to an integer with `as.numeric()`. 
50 |    \item Cascade id: The identifier of the cascade that the event pertains to.
51 | }
52 | The default names for these columns are \code{node_name}, \code{event_time} 
53 | and \code{cascade_id}. If other names are used in the \code{data} object the 
54 | names have to be specified in the corresponding arguments (see argument 
55 | documentation)
56 | }
57 | \examples{
58 | 
59 | df <- simulate_rnd_cascades(10, n_nodes = 20)
60 | cascades <- as_cascade_long(df)
61 | is.cascade(cascades)
62 | }
63 | 


--------------------------------------------------------------------------------
/man/policies.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/data.R
 3 | \docType{data}
 4 | \name{policies}
 5 | \alias{policies}
 6 | \alias{policies_metadata}
 7 | \title{US State Policy Adoption (SPID)}
 8 | \format{The data comes in two objects of class \code{data.frame}. The first
 9 |     object, named \code{policies} contains the adoption events. Each row 
10 |     corresponds to an adoption event. Each adoption event is described by 
11 |     the three columns: 
12 | \itemize{
13 |     \item \code{statenam}: Name of the adopting state.
14 |     \item \code{policy}: Name of the policy.
15 |     \item \code{adopt_year}: Year when the state adopted the policy.
16 | }
17 | The second object (\code{policies_metadata}) contains more details on each
18 | of the policies. It contains these columns:
19 | \itemize{
20 |     \item \code{policy}: Name of the policy.
21 |     \item \code{source}: Original source of the data.
22 |     \item \code{first_year}: First year any state adopted this policy.
23 |     \item \code{last_year}: Last year any state adopted this policy.
24 |     \item \code{adopt_count}: Number of states that adopted this policy.
25 |     \item \code{description}: Description of the policy.
26 |     \item \code{majortopic}: Topic group the policy belongs to.
27 | } 
28 | Both \code{data.frame} objects can be joined (merged) on the common column
29 | \code{policy} (see example code).}
30 | \source{
31 | \doi{10.7910/DVN/CVYSR7}
32 | }
33 | \usage{
34 | data(policies)
35 | }
36 | \description{
37 | The SPID data includes information on the year of adoption for over 700 
38 | policies in the American states.
39 | }
40 | \details{
41 | This version 1.0 of the database. For each policy we document the year of first 
42 | adoption for each state. Adoption dates range from 1691 to 2017 and includes 
43 | all fifty states. Policies are adopted by anywhere from 1 to 50 states, with 
44 | an average of 24 adoptions. The data were assembled from a variety of sources, 
45 | including academic publications and policy advocacy/information groups. 
46 | Policies were coded according to the Policy Agendas Project major 
47 | topic code. Additional information on policies is available at the source 
48 | repository.
49 | }
50 | \examples{
51 | 
52 | data('policies')
53 | 
54 | # Join the adoption events with the metadata 
55 | merged_policies <- merge(policies, policies_metadata, by = 'policy')
56 | }
57 | \references{
58 | Boehmke, Frederick J.; Mark Brockway; Bruce A. Desmarais; 
59 |     Jeffrey J. Harden; Scott LaCombe; Fridolin Linder; and 
60 |     Hanna Wallach. 2018. "A New Database for Inferring Public Policy 
61 |     Innovativeness and Diffusion Networks." Working paper.
62 | }
63 | \keyword{datasets}
64 | 


--------------------------------------------------------------------------------
/R/summary.R:
--------------------------------------------------------------------------------
 1 | #' Summarize a cascade object
 2 | #' 
 3 | #' Generates summary statistics for single cascades and across cascades in a 
 4 | #' collection, contained in a cascades object.
 5 | #' 
 6 | #' @param object object of class cascade to be summarized.
 7 | #' @param quiet logical, if \code{FALSE} summary stats are printed to std out.
 8 | #' @param ... Additional arguments passed to summary.
 9 | #' @return Prints cascade summary information to the screen
10 | #'     (if \code{quiet = FALSE}). \code{'# cascades'} is the number of cascades in
11 | #'     the object, \code{'# nodes'} is the number of nodes in the system (nodes 
12 | #'     that can theoretically experience an event), \code{'# nodes in cascades'} is 
13 | #'     the number of unique nodes of the system that experienced an event and 
14 | #'     \code{'# possible edges'} is the number of edges that are possible given
15 | #'     the cascade data (see \code{\link{count_possible_edges}} for details.). 
16 | #'     
17 | #'     Additional summaries for each cascade are returned invisibly. 
18 | #      as a \code{data.frame} with columns \code{name} (name of the 
19 | #'     cascade), \code{length} (length of the cascade as an integer of how many
20 | #'     nodes experienced and event) and \code{n_ties} (number of tied event 
21 | #'     times per cascade).
22 | #'     
23 | #' @examples 
24 | #' data(cascades)
25 | #' summary(cascades)
26 | #' 
27 | #' @export
28 | summary.cascade <- function(object, quiet = FALSE, ...) {
29 |     # Cascade info
30 |     casc_lengths <- sapply(object$cascade_nodes, length)
31 |     dups <- sapply(object$cascade_times, count_dups_)
32 |     names(casc_lengths) <- NULL
33 |     casc_names <- names(object$cascade_nodes)
34 |     casc_info <- data.frame("name" = casc_names, "length" = casc_lengths,
35 |                             "n_ties" = dups)
36 |     # Print out summaries 
37 |     if(!quiet) {
38 |         # Calculate all summaries
39 |         n_nodes <- length(object$node_names)
40 |         n_nodes_in_casc <- length(unique(do.call(c, object$cascade_nodes)))
41 |         npe <- count_possible_edges(object)
42 |         pout <- cbind(summary(casc_lengths), summary(dups))
43 |         colnames(pout) <- c("length", "ties")
44 |         cat(paste0('# cascades: ', length(object$cascade_times), '\n'))
45 |         cat(paste0('# nodes: ', n_nodes, '\n'))
46 |         cat(paste0('# nodes in cascades: ', n_nodes_in_casc, '\n'))
47 |         cat(paste0('# possible edges: ', npe, '\n\n'))
48 |         cat("Summary statistics for cascade length and number of ties:\n") 
49 |         print(pout)
50 |     }
51 |      
52 |     return(invisible(casc_info))
53 | }
54 | 
55 | count_dups_ <- function(x) {
56 |     return(length(x) - length(unique(x)))
57 | }
58 | 
59 | 


--------------------------------------------------------------------------------
/src/netinf.h:
--------------------------------------------------------------------------------
 1 | #include "possible_edges.h"
 2 | 
 3 | using namespace Rcpp;
 4 | 
 5 | /**
 6 |  * For each cascade the edge u -> v is possible check if it improves fit and 
 7 |  * keep track of the ones where it does
 8 |  * 
 9 |  * @param u Integer id of parent node
10 |  * @param v Integer id of child node
11 |  * @param possible_edges edge_map containing data on all possible edges
12 |  * @param cascade_nodes A list of integer vectors containing the node ids of
13 |  *     the cascade in order of infection.
14 |  * @param  cascade_times A list of numeric vectors each containing infection 
15 |  *     times for the corresponding nodes in \code{cascade_ids}.
16 |  * 
17 |  * @return An Rcpp List containing:
18 |  *     [0]: Aggregate improvement from this edge over all trees
19 |  *     [1]: An integer vector of cascades where the edge caused improvement
20 |  *     [2]: The scores of the edge in each of the cascades in [1]
21 |  */
22 | List tree_replacement(int &u, int &v, edge_map &possible_edges,
23 |                        List &cascade_times, List &cascade_nodes,
24 |                        List &trees, std::string &model, NumericVector &params);
25 | 
26 | /**
27 |  * Run the netinf algorithm on a set of nodes and cascades
28 |  * 
29 |  * @param cascade_nodes A list of integer vectors containing the node ids of
30 |  *     the cascade in order of infection.
31 |  * @param  cascade_times A list of numeric vectors each containing infection 
32 |  *     times for the corresponding nodes in \code{cascade_ids}.
33 |  * @param model integer indicating the choice of model: 1: exponential, 
34 |  *     2: power law, 3: rayleigh (only exponential implemented).
35 |  * @param params NumericVector, Parameters for transmission model.
36 |  * @param n_edges Integer, number of edges to infer.
37 |  * @param quiet, Boolean, Should output on progress by suppressed.
38 |  * @param cutoff, p-value cutoff if auto-edges=TRUE
39 |  * 
40 |  * @return List containing one vector per edge.
41 | */
42 | List netinf_(List &cascade_nodes, List &cascade_times, 
43 |              int &n_edges, std::string &model, NumericVector &params, bool quiet, 
44 |              bool auto_edges, double cutoff);
45 | 
46 | /**
47 |  * Update the trees for each cascade using the new edge
48 |  * 
49 |  * @param trees List of trees for each cascade (see spanning_tree.h for 
50 |  *     documentation of the data structure)
51 |  * @param tree_scores Numeric Vector of aggregate likelihood scores for each tree
52 |  * @param replacement_data Return object from tree_replacement.
53 |  * @param cascade_nodes A list of integer vectors containing the node ids of
54 |  *     the cascade in order of infection.
55 |  * @param best_edge node ids of the edge that's updated
56 |  * @param
57 |  */
58 | void update_trees(List &trees, NumericVector &tree_scores, 
59 |                   List &replacement_data, List &cascade_nodes, 
60 |                   std::array<int, 2> best_edge);


--------------------------------------------------------------------------------
/man/netinf.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/netinf.R
 3 | \name{netinf}
 4 | \alias{netinf}
 5 | \title{Infer latent diffusion network}
 6 | \usage{
 7 | netinf(cascades, trans_mod = "exponential", n_edges = NULL,
 8 |   p_value_cutoff = NULL, params = NULL, quiet = FALSE,
 9 |   trees = FALSE)
10 | }
11 | \arguments{
12 | \item{cascades}{an object of class cascade containing node and cascade 
13 | information. See \code{\link{as_cascade_long}} and 
14 | \code{\link{as_cascade_wide}} for details.}
15 | 
16 | \item{trans_mod}{character, indicating the choice of model: 
17 | \code{"exponential"}, \code{"rayleigh"} or \code{"log-normal"}.}
18 | 
19 | \item{n_edges}{integer, number of edges to infer. Leave unspecified if using 
20 | \code{p_value_cutoff}.}
21 | 
22 | \item{p_value_cutoff}{numeric, in the interval (0, 1). If 
23 | specified, edges are inferred in each iteration until the Vuong test for 
24 | edge addition reaches the p-value cutoff or when the maximum 
25 | possible number of edges is reached. Leave unspecified if using 
26 | \code{n_edges} to explicitly specify number of edges to infer.}
27 | 
28 | \item{params}{numeric, Parameters for diffusion model. If left unspecified 
29 | reasonable parameters are inferred from the data. See details for how to 
30 | specify parameters for the different distributions.}
31 | 
32 | \item{quiet}{logical, Should output on progress by suppressed.}
33 | 
34 | \item{trees}{logical, Should the inferred cascade trees be returned. Note, 
35 | that this will lead to a different the structure of the function output. 
36 | See section Value for details.}
37 | }
38 | \value{
39 | Returns the inferred diffusion network as an edgelist in an object of 
40 |     class \code{diffnet} and \code{\link[base]{data.frame}}. The first 
41 |     column contains the sender, the second column the receiver node. The 
42 |     third column contains the improvement in fit from adding the edge that is
43 |     represented by the row. The output additionally has the following 
44 |     attributes:
45 |     \itemize{
46 |         \item \code{"diffusion_model"}: The diffusion model used to infer the 
47 |             diffusion network.
48 |         \item \code{"diffusion_model_parameters"}: The parameters for the 
49 |             model that have been inferred by the approximate profile MLE 
50 |             procedure.
51 |     }
52 |     If the argument \code{trees} is set to \code{TRUE}, the output is a list
53 |     with the first element being the \code{data.frame} described above, and 
54 |     the second element being the trees in edge-list form in a single 
55 |     \code{data.frame}.
56 | }
57 | \description{
58 | Infer a network of diffusion ties from a set of cascades. Each cascade 
59 | is defined by pairs of node ids and infection times.
60 | }
61 | \details{
62 | The algorithm is describe in detail in Gomez-Rodriguez et al. (2010). 
63 | Additional information can be found on the 
64 | netinf website (\url{http://snap.stanford.edu/netinf/}).
65 | 
66 | \itemize{
67 |     \item Exponential distribution: \code{trans_mod = "exponential"},
68 |     \code{params = c(lambda)}. 
69 |     Parametrization: \eqn{\lambda e^{-\lambda x}}.
70 |     \item Rayleigh distribution: \code{trans_mod = "rayleigh"},
71 |     \code{params = c(alpha)}. 
72 |     Parametrization: \eqn{\frac{x}{\alpha^2} \frac{e^{-x^2}}{2\alpha^2}}.
73 |     \item Log-normal distribution: \code{trans_mod = "log-normal"},
74 |     \code{params = c(mu, sigma)}. 
75 |     Parametrization: \eqn{\frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(ln x - \mu)^2}{2\sigma^2}}}.
76 | }
77 | 
78 | If higher performance is required and for very large data sets, a faster pure C++ 
79 | implementation is available in the Stanford Network Analysis Project (SNAP). 
80 | The software can be downloaded at \url{http://snap.stanford.edu/netinf/}.
81 | }
82 | \examples{
83 | 
84 | # Data already in cascades format:
85 | data(cascades)
86 | out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1)
87 | 
88 | # Starting with a dataframe
89 | df <- simulate_rnd_cascades(10, n_nodes = 20)
90 | cascades2 <- as_cascade_long(df, node_names = unique(df$node_name))
91 | out <- netinf(cascades2, trans_mod = "exponential", n_edges = 5, params = 1)
92 | 
93 | }
94 | \references{
95 | M. Gomez-Rodriguez, J. Leskovec, A. Krause. Inferring Networks of Diffusion 
96 | and Influence.The 16th ACM SIGKDD Conference on Knowledge Discovery and 
97 | Data Mining (KDD), 2010.
98 | }
99 | 


--------------------------------------------------------------------------------
/src/spanning_tree.cpp:
--------------------------------------------------------------------------------
  1 | #include <Rcpp.h>
  2 | #include "distributions.h"
  3 | 
  4 | using namespace Rcpp;
  5 | 
  6 | double edge_score(double &event_time_i, double &event_time_j, std::string &model,
  7 |                   NumericVector &params, bool tied) {
  8 |     double x = event_time_j - event_time_i;
  9 |     double y;
 10 |     double out;
 11 |     if (model == "exponential") {
 12 |         y = dexp_(x, params[0]);
 13 |     } else if (model == "rayleigh") {
 14 |         y = drayleigh_(x, params[0]);
 15 |     } else if (model == "log-normal") {
 16 |         y = dlognormal_(x, params[0], params[1]);
 17 |     } 
 18 |     if (tied) {
 19 |         out = log(0.5 * y);
 20 |     } else {
 21 |         out = log(0.0000000001 * y);
 22 |     }
 23 |     return out;
 24 | }
 25 | 
 26 | List optimal_spanning_tree(IntegerVector &cascade_nodes, 
 27 |                            NumericVector &cascade_times, std::string &model,
 28 |                            NumericVector &params) {
 29 |  
 30 |     // Init containers for the results
 31 |     int cascade_size = cascade_nodes.size();
 32 |     NumericVector parent_scores(cascade_size);
 33 |     IntegerVector parent_ids(cascade_size);
 34 |     
 35 |     // For each node involved in this cascade find the parent and the weight for
 36 |     // the respective edge 
 37 |     double tree_score = 0;
 38 |     for(int i = 0; i < cascade_size; i++) {
 39 |         // Only nodes that have an earlier event time can be parents for current
 40 |         // node
 41 |         IntegerVector possible_parents;
 42 |         NumericVector parent_times;
 43 |         for(int j = 0; j < cascade_size; j++) {
 44 |             if (cascade_times[j] < cascade_times[i]) {
 45 |                 possible_parents.push_back(cascade_nodes[j]);
 46 |                 parent_times.push_back(cascade_times[j]);
 47 |             } 
 48 |         }
 49 |         // Find the parent with the highest score if there are possible parents
 50 |         int n_parents = possible_parents.size();
 51 |         // If there are multiple potential parents find the one that gives the e
 52 |         // edge the maximum weight
 53 |         if (n_parents > 0) {
 54 |             double max_parent_score = -INFINITY;
 55 |             int parent;
 56 |             double score;
 57 |             for (int k = 0; k < n_parents; k++) {
 58 |                 score = edge_score(parent_times[k], cascade_times[i], model,
 59 |                                    params, false);
 60 |                 if (score > max_parent_score) {
 61 |                     max_parent_score = score;
 62 |                     parent = possible_parents[k];
 63 |                 }
 64 |             }
 65 |             // Select the parent with the max score and store the score
 66 |             if(max_parent_score == -INFINITY) {
 67 |                 std::string msg = "Observed time with zero likelihood. Consider adjusting lambda.\n";
 68 |                 throw std::invalid_argument(msg);
 69 |             }
 70 |             parent_ids[i] = parent;
 71 |             parent_scores[i] = max_parent_score;
 72 |             tree_score += max_parent_score;
 73 |             
 74 |         // If node can't have parent (fist node in cascade or tied first nodes) 
 75 |         // set parent id and score to NA
 76 |         } else {
 77 |             parent_ids[i] = -1;
 78 |             parent_scores[i] = NA_REAL;
 79 |         }
 80 |     }
 81 |     List out = List::create(parent_ids, parent_scores, tree_score); 
 82 |     return out; 
 83 | }
 84 | 
 85 | List initialize_trees(List &cascade_nodes, List &cascade_times, 
 86 |                       NumericVector &params, std::string &model) {
 87 |     
 88 |     // Output container
 89 |     int n_cascades = cascade_nodes.size();
 90 |     List out(n_cascades);
 91 |     NumericVector tree_scores(n_cascades, NA_REAL);
 92 |     
 93 |     // Calculate optimal spanning tree for each cascade
 94 |     for(int i = 0; i < n_cascades; i++) {
 95 |         checkUserInterrupt();
 96 |         IntegerVector this_cascade_ids = cascade_nodes[i];
 97 |         NumericVector this_cascade_times = cascade_times[i];
 98 |         List tree_result = optimal_spanning_tree(this_cascade_ids, 
 99 |                                                  this_cascade_times, model, 
100 |                                                  params);
101 |         tree_scores[i] = tree_result[2];
102 |         out[i] = tree_result;
103 |     }
104 |     return List::create(out, tree_scores);
105 | }
106 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
  1 | # NetworkInference 1.2.5.9000
  2 | 
  3 | Development Version
  4 | 
  5 | ## New Features
  6 | 
  7 | ## Bug Fixes
  8 | 
  9 | # NetworkInference 1.2.5
 10 | 
 11 | * Update C++11 -> C++17
 12 | * Minor fixes to package metadata
 13 | 
 14 | # NetworkInference 1.2.4
 15 | 
 16 | ## New Features
 17 | 
 18 | ## Bug Fixes
 19 | * Fixed potential memory error from iterating over the beginning/end of a `std::map`. The last fix didn't work under all circumstances.
 20 | 
 21 | # NetworkInference 1.2.3
 22 | 
 23 | ## New Features
 24 | 
 25 | ## Bug Fixes
 26 | * Fixed potential memory error from iterating over the beginning/end of a `std::map`
 27 | 
 28 | # NetworkInference 1.2.2
 29 | 
 30 | ## New Features
 31 | 
 32 | ## Bug Fixes
 33 | * `netinf()` with `log-normal` model didn't run because of an index error in the argument check for `params`
 34 | * Fixed memory allocation error caused by uninitialized comparison
 35 | 
 36 | # NetworkInference 1.2.1
 37 | 
 38 | ## Bug Fixes
 39 | 
 40 | * `netinf_` used ceiling on integer which caused error on Solaris
 41 | 
 42 | # NetworkInference 1.2.0 
 43 | 
 44 | 
 45 | ## New Features
 46 | 
 47 | #### Changes to `netinf()`
 48 | 
 49 | * `netinf()` got another **speed-up**. After the first edge, the computation 
 50 |     time for each edge is reduced by the factor number of nodes in the network
 51 | * Number of edges can now be chosen using a **Vuong style test**. If this 
 52 |     procedure should be used, a p-value is chosen at which the inference of new
 53 |     edges stops. This value is specified via the new `p_value_cutoff` argument
 54 |     to `netinf()`.
 55 | * This lead to the netinf output having a **fourth column** now, containing the 
 56 | p-value for each edge. The p-value is also available if a fixed number of edges
 57 | is chosen.
 58 | * If no starting values are provided via the `params` argument parameters
 59 |     are initialized by choosing the midpoint between the maximum possible
 60 |     parameter value and the minimum possible value. These values are derived
 61 |     using the closed form MLE of the respective parameter, derived from
 62 |     either the minimum possible diffusion times (assuming a diffusion
 63 |     'chain', i.e. `a -> b -> c -> ...`) or the maximum possible diffusion
 64 |     times (assuming a diffusion 'fan', i.e. `a -> b, a -> c, a -> d,...`).
 65 | * `n_edges` can now specify either an absolute number of edges, or a p-value
 66 |     cutoff in the interval `(0, 1)` for the Vuong test
 67 | * The log normal distribution is now available as a diffusion model. With this 
 68 |     comes a **change in the arguments** for `netinf`. Instead of `lambda`, 
 69 |     parameters are now specified with a vector (or scalar depending on 
 70 |     distribution) `params`. For exponential and rayleigh distributions `params` 
 71 |     is just the rate / alpha parameter. For the log-normal distribution `params` 
 72 |     specifies mean and variance (in that order). See the `netinf()` 
 73 |     documentation for details on specificaiton and parametrization (`?netinf`).
 74 | * The output from `netinf()` now contains information on the model, parameters 
 75 |     and iterations as attributes. See the documentation for details.
 76 | * The `policies` dataset has been updated with over 600 new policies from the 
 77 |     [SPID](https://doi.org/10.7910/DVN/CVYSR7) database 
 78 |     (access via `data(policies)`).
 79 | * Inferred cascade trees can now be returned by setting `trees = TRUE`.
 80 | 
 81 | #### New functions
 82 | * New function `drop_nodes()` now allows to drop nodes from all cascades in a cascade object.
 83 | 
 84 | #### Changes to `simulate_cascades()`
 85 | * `simulate_cascades()` now supports passing of additional (isolated in the diffusion network) nodes via the `nodes` argument.
 86 | * `simulate_cascades()` now also supports the log-normal distribution.
 87 | 
 88 | 
 89 | ## Bug Fixes
 90 | 
 91 | * Inference of very uninformative edges (large number of edges) could lead for the software to break. Fixed now 
 92 | * In `simulate_cascades()` with partial cascades provided, it was possible that nodes experienced an event earlier than the last event in the partial cascade. Now, the earliest event time is the last observed event time in the partial cascade.
 93 | 
 94 | ## Other changes
 95 | 
 96 | * C++ code is now modularized and headers are properly documented
 97 | 
 98 | 
 99 | # NetworkInference 1.1.2
100 | 
101 | 
102 | ## New Features
103 | 
104 | * We made changes to the internal data structures of the netinf function, so it is much faster and memory efficient now.
105 | * `netinf()` now has a shiny progress bar!
106 | * `as.cascade` is now completely removed (see release note on version 1.1.0).
107 | * New convenience function to subset cascades by time (`subset_cascade_time`) and by cascade id (`subset_cascade`).
108 | 
109 | ## Bug Fixes
110 | * Long running functions (that call compiled code) can now be interrupted without crashing the R session.
111 | * `as_cascade_long()` and `as_cascade_wide()` handle date input correctly now.
112 | * `as_cascade_wide()` couldn't handle data input of class `data.table`. 
113 | 
114 | 
115 | # NetworkInference 1.1.1
116 | 
117 | 
118 | ## Bug Fixes
119 | 
120 | * Use of igraph now conditional compliant with Writing R Extensions [1.1.3.1](https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Suggested-packages)
121 | * Fixed version number displayed in startup message
122 | 
123 | 
124 | # NetworkInference 1.1.0
125 | 
126 | ## New Features
127 | 
128 | * Data format (long or wide) of `as.cascade` is not bound to the class of the data object anymore. In 1.0.0 wide format had to be a matrix and long format had to be a dataframe. This did not make much sense. `as.cascade` is now deprecated and replaced by two new functions `as_cascade_long` and `as_cascade_wide`.
129 | 
130 | ## Bug Fixes
131 | 
132 | * x and y axis labels in `plot.cascade` with option `label_nodes=FALSE` were
133 |     reversed
134 | 
135 | 
136 | # NetworkInference 1.0.0
137 | 
138 | First release
139 | 


--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
  1 | #' US State Policy Adoption (SPID)
  2 | #'
  3 | #' The SPID data includes information on the year of adoption for over 700 
  4 | #' policies in the American states. 
  5 | #' 
  6 | #' This version 1.0 of the database. For each policy we document the year of first 
  7 | #' adoption for each state. Adoption dates range from 1691 to 2017 and includes 
  8 | #' all fifty states. Policies are adopted by anywhere from 1 to 50 states, with 
  9 | #' an average of 24 adoptions. The data were assembled from a variety of sources, 
 10 | #' including academic publications and policy advocacy/information groups. 
 11 | #' Policies were coded according to the Policy Agendas Project major 
 12 | #' topic code. Additional information on policies is available at the source 
 13 | #' repository.
 14 | #' 
 15 | #' @name policies
 16 | #' 
 17 | #' @usage data(policies)  
 18 | #' @docType data
 19 | #' 
 20 | #' @format The data comes in two objects of class \code{data.frame}. The first
 21 | #'     object, named \code{policies} contains the adoption events. Each row 
 22 | #'     corresponds to an adoption event. Each adoption event is described by 
 23 | #'     the three columns: 
 24 | #' \itemize{
 25 | #'     \item \code{statenam}: Name of the adopting state.
 26 | #'     \item \code{policy}: Name of the policy.
 27 | #'     \item \code{adopt_year}: Year when the state adopted the policy.
 28 | #' }
 29 | #' The second object (\code{policies_metadata}) contains more details on each
 30 | #' of the policies. It contains these columns:
 31 | #' \itemize{
 32 | #'     \item \code{policy}: Name of the policy.
 33 | #'     \item \code{source}: Original source of the data.
 34 | #'     \item \code{first_year}: First year any state adopted this policy.
 35 | #'     \item \code{last_year}: Last year any state adopted this policy.
 36 | #'     \item \code{adopt_count}: Number of states that adopted this policy.
 37 | #'     \item \code{description}: Description of the policy.
 38 | #'     \item \code{majortopic}: Topic group the policy belongs to.
 39 | #' } 
 40 | #' Both \code{data.frame} objects can be joined (merged) on the common column
 41 | #' \code{policy} (see example code).
 42 | #'     
 43 | #' @source \url{https://doi.org/10.7910/DVN/CVYSR7}
 44 | #' 
 45 | #' @aliases policies policies_metadata
 46 | #' 
 47 | #' @references Boehmke, Frederick J.; Mark Brockway; Bruce A. Desmarais; 
 48 | #'     Jeffrey J. Harden; Scott LaCombe; Fridolin Linder; and 
 49 | #'     Hanna Wallach. 2018. "A New Database for Inferring Public Policy 
 50 | #'     Innovativeness and Diffusion Networks." Working paper.
 51 | #'     
 52 | #' @examples
 53 | #' 
 54 | #' data('policies')
 55 | #' 
 56 | #' # Join the adoption events with the metadata 
 57 | #' merged_policies <- merge(policies, policies_metadata, by = 'policy')
 58 | "policies"
 59 | 
 60 | #' Example cascades
 61 | #'
 62 | #' An example dataset of 31 nodes and 54 cascades. From the original netinf 
 63 | #' implementation in SNAP. 
 64 | #' 
 65 | #' @name cascades
 66 | #' 
 67 | #' @usage data(cascades)  
 68 | #' @docType data
 69 | #' 
 70 | #' @format An object of class \code{cascade} containing 4 objects
 71 | #' \describe{
 72 | #'   \item{node_names}{Character node names}
 73 | #'   \item{cascade_nodes}{A list of integer vectors. Each containing the names of the
 74 | #'       nodes infected in this cascades in the order of infection}
 75 | #'   \item{cascade_times}{A list of numeric vectors. Each containing the infection
 76 | #'       times for the corresponding nodes in cascade_nodes}
 77 | #' }
 78 | #' @source \url{https://github.com/snap-stanford/snap/blob/master/examples/netinf/example-cascades.txt}
 79 | "cascades"
 80 | 
 81 | 
 82 | 
 83 | #' Validation output from netinf source.
 84 | #' 
 85 | #' Contains output from original netinf C++ implementation, executed on 
 86 | #' \code{\link{cascades}}. For testing purposes.
 87 | #' 
 88 | #' @name validation
 89 | #' 
 90 | #' @usage data(validation)  
 91 | #' @docType data
 92 | #' 
 93 | #' @format An object of class \code{data.frame} with 6 columns, containing:
 94 | #' \describe{
 95 | #'   \item{origin_node}{Origin of diffusion edge.}
 96 | #'   \item{destination_node}{Destination node of diffusion edge.}
 97 | #'   \item{volume}{??}
 98 | #'   \item{marginal_gain}{Marginal gain from edge.}
 99 | #'   \item{median_time_difference}{Median time between events in origin and 
100 | #'       destination}
101 | #'   \item{mean_time_difference}{Mean time between events in origin and 
102 | #'       destination}
103 | #' }
104 | #' 
105 | #' @source Output from netinf example program (\url{https://github.com/snap-stanford/snap/tree/master/examples/netinf}).
106 | "validation"
107 | 
108 | # # Code to generate validation dataset. Data is copied from netinf source example
109 | # # ouput
110 | #
111 | # dat <- c(23,0,21,3049.810355,1.543899,2.136381,
112 | # 9,5,21,3049.291245,1.574027,2.161101,
113 | # 0,31,20,2905.326401,1.889040,2.099126,
114 | # 5,14,16,2326.861114,1.732281,1.936626,
115 | # 5,3,15,2181.930785,1.229308,1.903394)
116 | #
117 | # validation <- data.frame(matrix(dat, nc = 6, byrow = TRUE))
118 | # validation <- validation[order(validation[, 1], validation[, 2]), ]
119 | # rownames(validation) <- c(1:nrow(validation))
120 | # validation[, 1] <- as.character(validation[, 1])
121 | # validation[, 2] <- as.character(validation[, 2])
122 | # colnames(validation) <- c("origin_node", "destination_node", "volume", 
123 | #                          "marginal_gain", "median_time_difference",
124 | #                          "mean_time_difference")
125 | # save(validation, file = 'data/validation.RData')
126 | 
127 | 
128 | #' Larger simulated validation network.
129 | #' 
130 | #' A network from simulated data. For testing purposes.
131 | #' 
132 | #' @name sim_validation
133 | #' 
134 | #' @usage data(sim_validation)  
135 | #' @docType data
136 | #' 
137 | #' @format An object of class \code{data.frame} with 4 columns, containing:
138 | #' \describe{
139 | #'   \item{origin_node}{Origin of diffusion edge.}
140 | #'   \item{destination_node}{Destination node of diffusion edge.}
141 | #'   \item{improvement}{Improvement in score for the edge}
142 | #'   \item{p-value}{p-value for vuong test}
143 | #' }
144 | #' 
145 | #' @source See code below.
146 | "sim_validation"
147 | 
148 | # # Code to generate validation dataset.
149 | # set.seed(142857)
150 | # df <- simulate_rnd_cascades(50, 50)
151 | # cascades <- as_cascade_long(df)
152 | # network <- netinf(cascades, n_edges = 0.05)
153 | # sim_validation <- list("input" = cascades, "output" = network)
154 | # save(sim_validation, file = 'data/sim_validation.RData')
155 |  


--------------------------------------------------------------------------------
/R/plot.R:
--------------------------------------------------------------------------------
  1 | # Common plotting elements
  2 | # 
  3 | # Plotting layout for NetworkInference package.
  4 | # 
  5 | # @param mode What elements to return.
  6 | # 
  7 | # @return A ggplot object that can be added to a ggplot plot 
  8 | PLOT_THEME_ <- function(mode = NULL) {
  9 |     if(is.null(mode)) {
 10 |         out <- theme_bw() 
 11 |     } else if(mode == "color") {
 12 |         out <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 
 13 |                  "#0072B2", "#D55E00", "#CC79A7")
 14 |     }
 15 |     return(out)
 16 | }
 17 | 
 18 | #' Plot a cascade object
 19 | #' 
 20 | #' Allows plotting of one or multiple, labeled or unlabeled cascades.
 21 | #' 
 22 | #' The function returns a ggplot plot object (class \code{gg, ggplot}) which 
 23 | #' can be modified like any other ggplot. See the ggplot documentation and the 
 24 | #' examples below for more details.
 25 | #' 
 26 | #' @import ggplot2
 27 | #' @import ggrepel
 28 | #' @importFrom stats density
 29 | #' 
 30 | #' @param x object of class cascade to be plotted.
 31 | #' @param label_nodes logical, indicating if should the nodes in each cascade be 
 32 | #'     labeled. If the cascades are very dense setting this to \code{FALSE} is
 33 | #'     recommended.
 34 | #' @param selection a vector of cascade ids to plot.
 35 | #' @param ... additional arguments passed to plot.
 36 | #' 
 37 | #' @examples 
 38 | #' 
 39 | #' data(cascades)
 40 | #' plot(cascades, selection = names(cascades$cascade_nodes)[1:5])
 41 | #' plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20))
 42 | #' 
 43 | #' # Modify resulting ggplot object
 44 | #' library(ggplot2) 
 45 | #' p <- plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20))
 46 | #' ## Add a title
 47 | #' p <- p + ggtitle('Your Title')
 48 | #' p
 49 | #' ## Change Axis
 50 | #' p <- p + xlab("Your modified y axis label") #x and y labels are flipped here
 51 | #' p <- p + ylab("Your modified x axis label") #x and y labels are flipped here
 52 | #' p
 53 | #' 
 54 | #' @return A ggplot plot object.
 55 | #' @export
 56 | plot.cascade <- function(x, label_nodes = TRUE, selection = NULL, ...) {
 57 |     
 58 |     # Check inputs
 59 |     assert_that(inherits(x, "cascade"))
 60 |     assert_that(inherits(label_nodes, "logical"))
 61 |     pdat <- as.data.frame(x)
 62 |     
 63 |     # Select cascades
 64 |     if(!is.null(selection)) {
 65 |         # Check selection input
 66 |         assert_that(length(selection) >= 1) 
 67 |         assert_that(is.element(class(selection), c("character", "numeric", 
 68 |                                                    "integer", "factor")))
 69 |         chk <- is.element(selection, unique(pdat$cascade_id))
 70 |         if(!all(chk)) {
 71 |             msg <- paste("The following cascade id(s) provided in `selection` do",
 72 |                          "not exist in the cascade object:", 
 73 |                          paste0(selection[!chk], collapse = ","), "\n")
 74 |             stop(msg) 
 75 |         }
 76 |         selection <- as.character(selection)  
 77 |         # Slice data
 78 |         sel <- is.element(pdat$cascade_id, selection)
 79 |         pdat <- pdat[sel, ]
 80 |     }
 81 |     
 82 |     pdat$cascade_id <- as.factor(pdat$cascade_id)
 83 |     
 84 |     if(length(unique(pdat$cascade_id)) > 20 & label_nodes) {
 85 |         msg <- paste("Plotting more than 20 cascades with labels is not",
 86 |                      "recommended. Set label_nodes to FALSE or choose a subset",
 87 |                      "of cascades using the `selection` argument\n")
 88 |         warning(msg)
 89 |     }
 90 |     
 91 |     # Plot
 92 |     palette <-  PLOT_THEME_(mode = "color")
 93 |     
 94 |     ## Labeled Plot
 95 |     if(label_nodes) {
 96 |         p <- ggplot() +
 97 |             geom_line(aes_string(x = "event_time", y = "cascade_id"), 
 98 |                       color = "grey", linetype = 2, data = pdat) +
 99 |             geom_label_repel(aes_string(label = "node_name", color = "node_name", 
100 |                                         x = "event_time", y = "cascade_id"), 
101 |                              size = 2.5, data = pdat) +
102 |             ylab("Cascade ID") + xlab("Time") +
103 |             scale_color_discrete(guide = FALSE)
104 |     ## Unlabeled plot
105 |     } else {
106 |         p <- ggplot(pdat, aes_string(x = "cascade_id", y = "event_time")) +
107 |             geom_violin() + 
108 |             geom_jitter(height = 0, width = 0.05, alpha = 0.6, size = 0.5) +
109 |             xlab("Cascade ID") + ylab("Time") +
110 |             coord_flip()
111 |     }
112 | 
113 |     ## Layout
114 |     p <- p + 
115 |         PLOT_THEME_()
116 |     return(p)
117 | }
118 | 
119 | #' Visualize netinf output
120 | #' 
121 | #' Visualize the inferred diffusion network or the marginal gain in fit obtained
122 | #' by addition of each edge.
123 | #' 
124 | #' If `type = improvement` a ggplot object is returned. It can be modified like
125 | #' any other ggplot. See the ggplot documentation and the examples in 
126 | #' \link{plot.cascade}.
127 | #' 
128 | #' @import ggplot2
129 | #' 
130 | #' @param x object of class diffnet to be plotted.
131 | #' @param type character, one of \code{c("network", "improvement", "p-value")} 
132 | #'     indicating if the inferred diffusion network, the 
133 | #'     improvement for each edge or the p-value from the vuong test for each
134 | #'     edge should be visualized .
135 | #' @param ... additional arguments.
136 | #' 
137 | #' @examples 
138 | #'
139 | #' \dontrun{
140 | #'  data(cascades)
141 | #'  res <- netinf(cascades, quiet = TRUE)
142 | #'  plot(res, type = "network")
143 | #'  plot(res, type = "improvement")
144 | #'  plot(res, type = "p-value")
145 | #' }
146 | #' 
147 | #' @return A ggplot plot object if \code{type = "improvement"} otherwise an 
148 | #'     igraph plot.
149 | #' @export
150 | plot.diffnet <- function(x, type = "network", ...) {
151 |     # Check inputs
152 |     type <- match.arg(type, c("network", "improvement", "p-value"))
153 |     
154 |     if(type == "network") {
155 |         if (requireNamespace("igraph", quietly = TRUE)) {
156 |             # Plot network
157 |             g <- igraph::graph_from_data_frame(d = x[, 1:2])
158 |             igraph::plot.igraph(g, edge.arrow.size=.3, vertex.color = "grey70")
159 |         } else {
160 |             stop("In order to use this functionality the `igraph` package needs to be installed. Run `install.packages('igraph')` and retry.")
161 |         } 
162 |     }
163 |     else if(type == "improvement") {
164 |         ggplot(x) + 
165 |             geom_line(aes_string(x=c(1:nrow(x)), y = "improvement"), 
166 |                       color = "grey80", size = 0.5) +
167 |             geom_point(aes_string(x=c(1:nrow(x)), y = "improvement"), 
168 |                        size = 0.5) + 
169 |             xlab("Edge Number") + ylab("Improvement") +
170 |             PLOT_THEME_()
171 |     } else {
172 |         ggplot(x) + 
173 |             geom_line(aes_string(x=c(1:nrow(x)), y = "p_value"), 
174 |                       color = "grey80", size = 0.5) +
175 |             geom_point(aes_string(x=c(1:nrow(x)), y = "p_value"), 
176 |                        size = 0.5) + 
177 |             xlab("Edge Number") + ylab("P-Value") +
178 |             PLOT_THEME_()
179 |     }
180 | }
181 | 


--------------------------------------------------------------------------------
/R/netinf.R:
--------------------------------------------------------------------------------
  1 | #' Infer latent diffusion network
  2 | #' 
  3 | #' Infer a network of diffusion ties from a set of cascades. Each cascade 
  4 | #' is defined by pairs of node ids and infection times. 
  5 | #' 
  6 | #' The algorithm is describe in detail in Gomez-Rodriguez et al. (2010). 
  7 | #' Additional information can be found on the 
  8 | #' netinf website (\url{http://snap.stanford.edu/netinf/}).
  9 | #' 
 10 | #' \itemize{
 11 | #'     \item Exponential distribution: \code{trans_mod = "exponential"},
 12 | #'     \code{params = c(lambda)}. 
 13 | #'     Parametrization: \eqn{\lambda e^{-\lambda x}}.
 14 | #'     \item Rayleigh distribution: \code{trans_mod = "rayleigh"},
 15 | #'     \code{params = c(alpha)}. 
 16 | #'     Parametrization: \eqn{\frac{x}{\alpha^2} \frac{e^{-x^2}}{2\alpha^2}}.
 17 | #'     \item Log-normal distribution: \code{trans_mod = "log-normal"},
 18 | #'     \code{params = c(mu, sigma)}. 
 19 | #'     Parametrization: \eqn{\frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(ln x - \mu)^2}{2\sigma^2}}}.
 20 | #' }
 21 | #' 
 22 | #' If higher performance is required and for very large data sets, a faster pure C++ 
 23 | #' implementation is available in the Stanford Network Analysis Project (SNAP). 
 24 | #' The software can be downloaded at \url{http://snap.stanford.edu/netinf/}.
 25 | #' 
 26 | #' @import checkmate
 27 | #' @import assertthat
 28 | #' 
 29 | #' @param  cascades an object of class cascade containing node and cascade 
 30 | #'     information. See \code{\link{as_cascade_long}} and 
 31 | #'     \code{\link{as_cascade_wide}} for details. 
 32 | #' @param trans_mod character, indicating the choice of model: 
 33 | #'      \code{"exponential"}, \code{"rayleigh"} or \code{"log-normal"}.
 34 | #' @param params numeric, Parameters for diffusion model. If left unspecified 
 35 | #'     reasonable parameters are inferred from the data. See details for how to 
 36 | #'     specify parameters for the different distributions.
 37 | #' @param n_edges integer, number of edges to infer. Leave unspecified if using 
 38 | #'     \code{p_value_cutoff}.
 39 | #' @param p_value_cutoff numeric, in the interval (0, 1). If 
 40 | #'     specified, edges are inferred in each iteration until the Vuong test for 
 41 | #'     edge addition reaches the p-value cutoff or when the maximum 
 42 | #'     possible number of edges is reached. Leave unspecified if using 
 43 | #'     \code{n_edges} to explicitly specify number of edges to infer.
 44 | #' @param quiet logical, Should output on progress by suppressed.
 45 | #' @param trees logical, Should the inferred cascade trees be returned. Note, 
 46 | #'     that this will lead to a different the structure of the function output. 
 47 | #'     See section Value for details.
 48 | #' 
 49 | #' @return Returns the inferred diffusion network as an edgelist in an object of 
 50 | #'     class \code{diffnet} and \code{\link[base]{data.frame}}. The first 
 51 | #'     column contains the sender, the second column the receiver node. The 
 52 | #'     third column contains the improvement in fit from adding the edge that is
 53 | #'     represented by the row. The output additionally has the following 
 54 | #'     attributes:
 55 | #'     \itemize{
 56 | #'         \item \code{"diffusion_model"}: The diffusion model used to infer the 
 57 | #'             diffusion network.
 58 | #'         \item \code{"diffusion_model_parameters"}: The parameters for the 
 59 | #'             model that have been inferred by the approximate profile MLE 
 60 | #'             procedure.
 61 | #'     }
 62 | #'     If the argument \code{trees} is set to \code{TRUE}, the output is a list
 63 | #'     with the first element being the \code{data.frame} described above, and 
 64 | #'     the second element being the trees in edge-list form in a single 
 65 | #'     \code{data.frame}.
 66 | #'  
 67 | #' @references 
 68 | #' M. Gomez-Rodriguez, J. Leskovec, A. Krause. Inferring Networks of Diffusion 
 69 | #' and Influence.The 16th ACM SIGKDD Conference on Knowledge Discovery and 
 70 | #' Data Mining (KDD), 2010.
 71 | #'      
 72 | #' @examples 
 73 | #' 
 74 | #' # Data already in cascades format:
 75 | #' data(cascades)
 76 | #' out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1)
 77 | #' 
 78 | #' # Starting with a dataframe
 79 | #' df <- simulate_rnd_cascades(10, n_nodes = 20)
 80 | #' cascades2 <- as_cascade_long(df, node_names = unique(df$node_name))
 81 | #' out <- netinf(cascades2, trans_mod = "exponential", n_edges = 5, params = 1)
 82 | #' 
 83 | #' @export
 84 | netinf <- function(cascades, trans_mod = "exponential", n_edges = NULL, 
 85 |                    p_value_cutoff = NULL, params = NULL, quiet = FALSE, 
 86 |                    trees = FALSE) {
 87 |     
 88 |     # Check inputs 
 89 |     assert_that(class(cascades)[1] == "cascade")
 90 |     qassert(trans_mod, "S1")
 91 |     if(is.null(n_edges) & is.null(p_value_cutoff)) {
 92 |         stop('Please specify either `n_edges` or `p_value_cutoff`.')
 93 |     }
 94 |     if(!is.null(n_edges) & !is.null(p_value_cutoff)) {
 95 |         stop('Please only specify either `n_edges` or `p_value_cutoff`.')
 96 |     }
 97 |     if(!is.null(n_edges)) {
 98 |         qassert(n_edges, "X1[1,)")
 99 |         auto_edges <- FALSE 
100 |         cutoff <- 0 # Not used
101 |     } else {
102 |         qassert(p_value_cutoff, "R1(0,1]")
103 |         auto_edges <- TRUE
104 |         cutoff <- p_value_cutoff
105 |         n_edges <- 0 # Not used since n_edges inferred form cutoff
106 |     } 
107 |     
108 |     model <- match.arg(trans_mod, c("exponential", "rayleigh", 'log-normal'))
109 |     
110 |     if(!is.null(params)) {
111 |         if(model == "exponential" | model == "rayleigh") {
112 |             qassert(params, "N1")
113 |         } else if(model == "log-normal") {
114 |             qassert(params[1], "N1")
115 |             qassert(params[2], "N1(0,)")
116 |         }
117 |     }
118 |     
119 |     # Assign integer node ids
120 |     # Note that the ids start at 0 (c++ is 0 indexed)
121 |     node_ids <- c(0:(length(cascades$node_names) - 1))
122 |     names(node_ids) <- cascades$node_names
123 |     
124 |     # Transform node ids in cascades to integer ids
125 |     cascade_nodes <- lapply(cascades$cascade_nodes, function(x) node_ids[x]) 
126 |     
127 |     # Initialize parameters    
128 |     if(is.null(params)) {
129 |         max_times <- do.call(c, lapply(cascades$cascade_times, 
130 |                                        function(x) x[-1] - x[1]))
131 |         min_times <- do.call(c, lapply(cascades$cascade_times, 
132 |                             function(x) x[-1] - x[-length(x)]))
133 |         if(model == "exponential") {
134 |             lambda_min <- 1 / mean(max_times, na.rm = T)
135 |             lambda_max <- 1 / mean(min_times, na.rm = T)
136 |             params <- mean(c(lambda_max, lambda_min))           
137 |         } else if(model == "rayleigh") {
138 |             N <- length(max_times)
139 |             sh_max <- sqrt(sum(max_times^2) / 2 * N)
140 |             sh_min <- sqrt(sum(min_times^2) / 2 * N)
141 |             adjustment <- exp(lgamma(N) + log(sqrt(N))) / exp(lgamma(N + 1 / 2))
142 |             params <- mean(c(sh_max * adjustment, sh_min * adjustment))
143 |         } else if(model == "log-normal") {
144 |             mean_max <- mean(log(max_times[max_times != 0]))
145 |             mean_min <- mean(log(min_times[min_times != 0]))
146 |             sigma_max <- sqrt(stats::var(log(max_times[max_times != 0])))
147 |             sigma_min <- sqrt(stats::var(log(min_times[min_times != 0])))
148 |             params <- c(mean(mean_max, mean_min), mean(sigma_max, sigma_min))
149 |         }
150 |         qassert(params, "R")
151 |         if(!quiet) cat('Initialized parameters with: ', params, '\n')
152 |     }
153 |     
154 |     # Run netinf 
155 |     netinf_out <- netinf_(cascade_nodes = cascade_nodes, 
156 |                           cascade_times = cascades$cascade_times, 
157 |                           model = model, params = params, n_edges = n_edges, 
158 |                           quiet = quiet, auto_edges = auto_edges, 
159 |                           cutoff = cutoff)
160 |        
161 | 
162 |     network <- as.data.frame(cbind(do.call(rbind, netinf_out[[1]]), 
163 |                              netinf_out[[2]]), stringsAsFactors = FALSE)
164 | 
165 |   
166 |     ## Replace integer node_ids with node_names
167 |     ### In the edgelist
168 |     network[, 1] <- cascades$node_names[(network[, 1] + 1)]
169 |     network[, 2] <- cascades$node_names[(network[, 2] + 1)]
170 |     colnames(network) <- c("origin_node", "destination_node", "improvement")
171 |     network$p_value <- netinf_out[[4]]
172 |     class(network) <- c("diffnet", "data.frame")
173 |     attr(network, "diffusion_model") = model
174 |     attr(network, "diffusion_model_parameters") = params
175 |     
176 |     if(trees) {
177 |         tree_dfs <- lapply(1:length(netinf_out[[3]]), function(i) {
178 |             x <- netinf_out[[3]][[i]]
179 |             out <- as.data.frame(cbind(x[[1]], x[[2]], rep(i, length(x[[1]]))))}
180 |             )
181 |         trees_df <- do.call(rbind, tree_dfs)
182 |         
183 |         # Replace int node ids with node_names 
184 |         trees_df$child <- do.call(c, cascades$cascade_nodes)
185 |         trees_df <- trees_df[!is.na(trees_df[, 2]), ]
186 |         trees_df[, 1] <- cascades$node_names[(trees_df[, 1] + 1)]
187 |         casc_names <- names(cascades$cascade_nodes)
188 |         trees_df[, 3] <- casc_names[trees_df[, 3]] 
189 |         colnames(trees_df) <- c("parent", "log_score", "cascade_id", "child") 
190 |         trees_df <- trees_df[, c(1, 4, 2, 3)]
191 |         return(list('network' = network, 'trees' = trees_df))
192 |     }
193 |     else return(network) 
194 | }
195 | 
196 | 
197 | #' Is the object of class diffnet?
198 | #' 
199 | #' Tests if an object is of class diffnet. The class diffnet is appended to the 
200 | #' object returned by \code{\link{netinf}} for dispatch of appropriate plotting 
201 | #' methods.
202 | #' 
203 | #' @param object the object to be tested.
204 | #' 
205 | #' @return \code{TRUE} if object is a diffnet, \code{FALSE} otherwise.
206 | #' 
207 | #' @examples
208 | #' 
209 | #' data(cascades)
210 | #' result <- netinf(cascades, n_edges = 6, params = 1)
211 | #' is.diffnet(result)
212 | #' @export
213 | is.diffnet <- function(object) {
214 |     inherits(object, "diffnet")
215 | }
216 | 
217 | 
218 | #' Count the number of possible edges in the dataset
219 | #' 
220 | #' Across all cascades, count the edges that are possible. An edge from node
221 | #' \code{u} to node \code{v}
222 | #' is only possible if in at least one cascade \code{u} experienced an event 
223 | #' before \code{v}.
224 | #' 
225 | #' @param cascades Object of class cascade containing the data.
226 | #' 
227 | #' @return An integer count.
228 | #' 
229 | #' @examples
230 | #' data(cascades)
231 | #' count_possible_edges(cascades)
232 | #' 
233 | #' @export 
234 | count_possible_edges <- function(cascades) {
235 |     # Check inputs
236 |     assert_that(is.cascade(cascades))
237 |     
238 |     # Assign integer node ids
239 |     # Note that the ids start at 0 (c++ is 0 indexed)
240 |     node_ids <- c(0:(length(cascades$node_names) - 1))
241 |     names(node_ids) <- cascades$node_names
242 |     
243 |     # Transform node ids in cascades to integer ids
244 |     cascade_nodes <- lapply(cascades$cascade_nodes, function(x) node_ids[x]) 
245 |  
246 |     n <- count_possible_edges_(cascade_nodes, cascades$cascade_times)
247 |     return(n) 
248 | }


--------------------------------------------------------------------------------
/src/netinf.cpp:
--------------------------------------------------------------------------------
  1 | // [[Rcpp::depends(RcppProgress)]]
  2 | #include <progress.hpp>
  3 | #include <progress_bar.hpp>
  4 | #include <Rcpp.h>
  5 | #include "netinf_utilities.h"
  6 | #include "vuong_test.h"
  7 | #include "possible_edges.h"
  8 | #include "spanning_tree.h"
  9 | #include "netinf.h"
 10 | #include "distributions.h"
 11 | 
 12 | using namespace Rcpp;
 13 | 
 14 | typedef edge_map::iterator m_iter;
 15 | typedef edge_map::reverse_iterator rm_iter;
 16 | 
 17 | // [[Rcpp::export]]
 18 | List netinf_(List &cascade_nodes, List &cascade_times, int &n_edges, 
 19 |              std::string &model, NumericVector &params, 
 20 |              bool quiet, bool &auto_edges, double &cutoff) {
 21 |     
 22 |     // Prepare the trees of each cascade (find the optimal spanning tree and 
 23 |     // store parents for each node and respective scores)
 24 |     if(!quiet) Rcout << "Initializing trees...\n";
 25 |     List trees_data = initialize_trees(cascade_nodes, cascade_times, params, 
 26 |                                        model);
 27 |     List trees = trees_data[0];
 28 |     NumericVector tree_scores = trees_data[1];
 29 |     
 30 |     // Get edges that are possible given the cascade data
 31 |     edge_map possible_edges = get_possible_edges_(cascade_nodes, cascade_times,
 32 |                                                   quiet);
 33 |    
 34 |     // Output containers
 35 |     int n_p_edges = possible_edges.size();
 36 |     if(auto_edges) n_edges = n_p_edges;
 37 |     List edges; 
 38 |     NumericVector scores;
 39 |     NumericVector p_values;
 40 |     
 41 |     if(n_edges > n_p_edges) {
 42 |         std::string msg = "Argument `n_edges` exceeds the maximal number of possible edges (which is " +
 43 |             std::to_string(n_p_edges) + ").\n";
 44 |         throw std::invalid_argument(msg);
 45 |     }
 46 |    
 47 |     if(!quiet) {
 48 |         if(auto_edges) Rcout << "Inferring edges using p-value cutoff...\n";
 49 |         else Rcout << "Inferring " << n_edges << " edges...\n";
 50 |     }
 51 |     
 52 |     // Set up for timing first iteration and progress bar (if not auto edges)
 53 |     bool show_progress = true;
 54 |     if(quiet) show_progress = false;
 55 |     if(auto_edges) show_progress = false;
 56 |     Progress p(n_edges, show_progress);
 57 |     
 58 |     int e;
 59 |     int check_interval = (n_p_edges / 10) + 1;
 60 |     id_array previous_best_edge = {{-1, -1}};
 61 |     NumericVector improvements(n_p_edges);
 62 |     
 63 |     for(e = 0; e < n_edges; e++) {
 64 |         
 65 |         m_iter start_iter = possible_edges.begin();
 66 |         id_array end_id = possible_edges.rbegin()->first;
 67 |         if(e > 0) {
 68 |             // Find the first edge in possible_edges that has the same child as 
 69 |             // previous_best_edge by iterating back from previous_best_edge
 70 |             m_iter it_best = possible_edges.find(previous_best_edge);
 71 |             int current_child = previous_best_edge[0];
 72 |             id_array last_key = it_best->first;
 73 |             for(m_iter rit = it_best; rit->first[0] == current_child; rit--) {
 74 |                 last_key = rit->first;
 75 |                 if(rit == possible_edges.begin()) break;
 76 |             }
 77 |             
 78 |             // And store the iterator as start point for edge inference 
 79 |             start_iter = possible_edges.find(last_key);
 80 |             
 81 |             // Find the last edge in possible_edges with the same child as 
 82 |             // previous_best_edges
 83 |             for(m_iter rit = it_best; rit != possible_edges.end(); rit++) {
 84 |                 if(rit->first[0] != current_child) break;
 85 |                 last_key = rit->first;
 86 |             }
 87 |             // And store the iterator as start point for edge inference 
 88 |             end_id = possible_edges.find(last_key)->first;
 89 |         } else {
 90 |             // In the first iteration we have to check every edge
 91 |         }
 92 |         
 93 |         int i = 0;
 94 |         for (m_iter x=start_iter; x!=possible_edges.end(); x++) {
 95 |             
 96 |             // Skip edge inferred in last iteration 
 97 |             if(x->first == previous_best_edge) continue;
 98 |            
 99 |             //potential parent
100 |             int parent = x->first[1];
101 |             // infected node
102 |             int child = x->first[0];
103 |             
104 |             //find replacements for u->v edge
105 |             List edge_replacements = tree_replacement(parent, child, 
106 |                                                       possible_edges, 
107 |                                                       cascade_times, 
108 |                                                       cascade_nodes, trees, 
109 |                                                       model, params);
110 |             // Store the updated potential improvement value for this edge
111 |             x->second.second = edge_replacements[0];
112 |             
113 |             // Check for user interrupt and update progress bar
114 |             if((i % check_interval) == 0) {
115 |                 checkUserInterrupt();
116 |             }
117 |             i += 1;
118 |             
119 |             // Stop when last edge to check is reached 
120 |             if(x->first == end_id) break;
121 |         }
122 |         
123 |         // Erase the previous best edge from possible edges
124 |         possible_edges.erase(previous_best_edge);
125 |         
126 |         // Check all improvements to find the best edge
127 |         double max_improvement = 0;
128 |         id_array best_edge;
129 |         for(m_iter x = possible_edges.begin(); x != possible_edges.end(); x++) {
130 |             if(x->second.second >= max_improvement) {
131 |                 max_improvement = x->second.second;
132 |                 best_edge = x->first;
133 |             }
134 |         }
135 |         // Re calculate the replacement data for the best edge
136 |         List best_edge_replacement_data = tree_replacement(best_edge[1], 
137 |                                                            best_edge[0], 
138 |                                                            possible_edges, 
139 |                                                            cascade_times, 
140 |                                                            cascade_nodes, trees, 
141 |                                                            model, params);
142 |         
143 |         // Store the best results
144 |         // Put edge in order parent->child for backwards compatibility
145 |         id_array best_edge_out = {{best_edge[1], best_edge[0]}};
146 |         edges.push_back(best_edge_out);
147 |         scores.push_back(max_improvement);
148 |         
149 |         // Update the trees with the new edge
150 |         NumericVector old_tree_scores = copy_vector(tree_scores);
151 |         update_trees(trees, tree_scores, best_edge_replacement_data, 
152 |                      cascade_nodes, best_edge);
153 |        
154 |         // Test if the edge improves fit
155 |         double p_value = vuong_test(old_tree_scores, tree_scores);
156 |         p_values.push_back(p_value);
157 | 
158 |         // Store the best edge for this iteration to inform what to iterate over
159 |         // for the next edge
160 |         previous_best_edge = best_edge;
161 |        
162 |         if(!auto_edges & !quiet) p.increment();
163 |         
164 |         if(!quiet & auto_edges){
165 |             Rcout << "\r" << (e+1) << " edges inferred. P-value: " << 
166 |                 p_value << std::flush;         
167 |         } 
168 |         
169 |         if(auto_edges & (p_value >= cutoff)) {
170 |             if(!quiet) Rcout << "\nReached p-value cutoff. Stopping.\n";
171 |             break;
172 |         }
173 |     }
174 |     
175 |     // Write out message if maximum number of edges has been reach below cutoff
176 |     if(auto_edges & (e == n_edges)) {
177 |         if(!quiet) Rcout << "Reached maximum number of possible edges" <<
178 |             " before p-value cutoff.\n";
179 |     }
180 |     List out = List::create(edges, scores, trees, p_values);
181 |     return out;
182 | }
183 | 
184 | List tree_replacement(int &parent, int &child, edge_map &possible_edges,
185 |                       List &cascade_times, List &cascade_nodes,
186 |                       List &trees, std::string &model, NumericVector &params) {
187 |     
188 |     // Get the cascades the edge is possible in:
189 |     std::array<int, 2> pair_id = {{child, parent}};
190 |     std::vector<int> cascades = possible_edges.find(pair_id)->second.first;
191 |     int n_possible_cascades = cascades.size();
192 |     
193 |     // Initialize output containers
194 |     IntegerVector cascades_with_replacement(n_possible_cascades, -1);
195 |     NumericVector replacement_scores(n_possible_cascades, NA_REAL);
196 |     
197 |     // Total improvement achieved by this edge across all trees
198 |     double improvement = 0;
199 |     for(int c = 0; c < cascades.size(); c++) {
200 |         
201 |         // Get pointers to the data of current cascade
202 |         int this_cascade = cascades[c];
203 |         IntegerVector this_cascade_nodes = cascade_nodes[this_cascade];
204 |         NumericVector this_cascade_times = cascade_times[this_cascade];
205 |         
206 |         // Get the event time for u and v in current cascade 
207 |         int idx_parent = get_index(this_cascade_nodes, parent);
208 |         int idx_child = get_index(this_cascade_nodes, child);
209 |         double timing_parent = this_cascade_times[idx_parent];
210 |         double timing_child = this_cascade_times[idx_child];
211 |         
212 |         // extract score associated with the current parent of child
213 |         List this_tree = trees[this_cascade];
214 |         NumericVector scores = this_tree[1];
215 |         double current_score = scores[idx_child];
216 |        
217 |         // what would the score be with the propspective parent (u)
218 |         double replacement_score = edge_score(timing_parent, timing_child, 
219 |                                               model, params, true);
220 |         
221 |         // If the edge has a higher score add it to overall improvement and 
222 |         // store the cascade the improvement occured in (and the new score)
223 |         if(replacement_score > current_score) {
224 |             improvement += replacement_score - current_score; 
225 |             cascades_with_replacement[c] = this_cascade;
226 |             replacement_scores[c] = replacement_score;
227 |         }
228 |     }
229 |    
230 | 
231 |     List out = List::create(improvement, cascades_with_replacement, 
232 |                             replacement_scores);
233 |     return out;
234 | }
235 | 
236 | void update_trees(List &trees, NumericVector &tree_scores, 
237 |                   List &replacement_data, List &cascade_nodes, 
238 |                   id_array best_edge) {
239 |    
240 |     IntegerVector updated_cascades = replacement_data[1];
241 |     NumericVector replacement_scores = replacement_data[2];       
242 |     NumericVector old_tree_scores = copy_vector(tree_scores);
243 |     
244 |     // Get u and v of best edge
245 |     int parent = best_edge[1];
246 |     int child = best_edge[0];
247 |     for(int i = 0; i < updated_cascades.size(); i++) {
248 |         int this_cascade = updated_cascades[i];
249 |         if(this_cascade < 0) continue;
250 |         IntegerVector this_cascade_nodes = cascade_nodes[this_cascade];
251 |         int idx_child = get_index(this_cascade_nodes, child);
252 |         List casc_tree = trees[this_cascade];
253 | 
254 |         IntegerVector this_parents = casc_tree[0];
255 |         NumericVector this_scores = casc_tree[1];
256 |         
257 |         //update parent id for v
258 |         this_parents[idx_child] = parent;
259 |         // update branch score
260 |         this_scores[idx_child] = replacement_scores[i];
261 |         tree_scores[this_cascade] = sum_vector(this_scores);
262 |     }
263 | }
264 | 


--------------------------------------------------------------------------------
/R/simulate_cascades.R:
--------------------------------------------------------------------------------
  1 | #' Simulate a set of random cascades
  2 | #'
  3 | #' Simulate random cascades, for testing and demonstration purposes. No actual 
  4 | #' diffusion model is underlying these cascades.
  5 | #' 
  6 | #' @importFrom stats runif
  7 | #' 
  8 | #' @param n_cascades Number of cascades to generate.
  9 | #' @param n_nodes Number of nodes in the system.
 10 | #'     
 11 | #' @return A data frame containing (in order of columns) node ids, 
 12 | #'     event time and cascade identifier.
 13 | #'     
 14 | #' @examples
 15 | #' 
 16 | #' df <- simulate_rnd_cascades(10, n_nodes = 20)
 17 | #' head(df)
 18 | #' 
 19 | #' @export
 20 | simulate_rnd_cascades <- function(n_cascades, n_nodes) {
 21 |     qassert(n_cascades, "X1[1,)")
 22 |     
 23 |     make_cascade_ <- function(cid) {
 24 |         n <- runif(1, 1, n_nodes)
 25 |         ids <- as.character(sample(1:n_nodes, n, replace = FALSE))
 26 |         times <- sort(runif(n, 0, 30), decreasing = TRUE)
 27 |         return(data.frame(ids, times, rep(cid, n), stringsAsFactors = FALSE))
 28 |     }
 29 |      
 30 |     cascades <- do.call(rbind, lapply(sample(c(1:n_cascades), n_cascades, 
 31 |                                              replace = FALSE), make_cascade_))
 32 |     colnames(cascades) <- c("node_name", "event_time", "cascade_id")
 33 |     rownames(cascades) <- as.character(c(1:nrow(cascades)))
 34 |     return(cascades)
 35 | }
 36 | 
 37 | 
 38 | #' Simulate cascades from a diffusion network
 39 | #'
 40 | #' Simulate diffusion cascades based on the generative model underlying netinf
 41 | #' and a diffusion network.
 42 | #' 
 43 | #' @import assertthat
 44 | #' 
 45 | #' @param diffnet object of class \code{diffnet}.
 46 | #' @param nsim integer, number of cascades to simulate.
 47 | #' @param max_time numeric, the maximum time after which observations are 
 48 | #'     censored
 49 | #' @param start_probabilities a vector of probabilities for each node in diffnet,
 50 | #'     to be the node with the first event. If \code{NULL} a node is drawn from
 51 | #'     a uniform distribution over all nodes.
 52 | #' @param partial_cascade object of type cascade, containing one partial 
 53 | #'     cascades for which further development should be simulated.
 54 | #' @param params numeric, (optional) parameters for diffusion time distribution. 
 55 | #'    See the details section of \code{\link{netinf}} for specification details.
 56 | #'    Only use this argument if parameters different from those contained in the 
 57 | #'    \code{diffnet} object should be used or the network is not an object of 
 58 | #'    class \code{diffnet}.
 59 | #' @param model character, diffusion model to use. One of \code{c("exponential", 
 60 | #'     "rayleigh", "log-normal")}. Only use this argument if parameters different 
 61 | #'     from those contained in the \code{diffnet} object should be used or the 
 62 | #'     network is not an object of class \code{diffnet}.
 63 | #' @param nodes vector of node ids if different from nodes included in 
 64 | #'     \code{diffnet}
 65 | #'     
 66 | #' @return A data frame with three columns. Containing 1) The names of 
 67 | #'     the nodes (\code{"node_name"}) that experience an event in each cascade, 
 68 | #'     2) the event time (\code{"event_time"}) of the corresponding node, 
 69 | #'     3) the cascade identifier \code{"cascade_id"}.
 70 | #'     
 71 | #' @examples
 72 | #' 
 73 | #' data(cascades) 
 74 | #' out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1)
 75 | #' simulated_cascades <- simulate_cascades(out, nsim = 10)
 76 | #'  
 77 | #' # Simulation from partial cascade
 78 | #' 
 79 | #' @export
 80 | simulate_cascades <- function(diffnet, nsim = 1, max_time = Inf, 
 81 |                               start_probabilities = NULL,
 82 |                               partial_cascade = NULL, params = NULL, 
 83 |                               model = NULL, nodes = NULL) {
 84 |     # Check inputs
 85 |     assert_that(is.diffnet(diffnet))
 86 |     assert_that(nsim >= 1)
 87 |     
 88 |     if(is.null(model)) {
 89 |         model <- attr(diffnet, "diffusion_model")
 90 |     }
 91 |     if(is.null(params)) {
 92 |         params <- attr(diffnet, "diffusion_model_parameters")
 93 |     }
 94 |     model <- match.arg(model, c("exponential", "rayleigh", "log-normal"))
 95 |     if(model == "rayleigh") {
 96 |         stop("Rayleigh distribution is not implemented yet. Please choose the exponential or log-normal diffusion model.")
 97 |     }
 98 |     if(is.null(nodes)) {
 99 |         nodes <- unique(c(diffnet$origin_node, diffnet$destination_node)) 
100 |         if(any(!is.element(partial_cascade$cascade_nodes[[1]], nodes))) {
101 |             # TODO: This could be a warning and nodes that are not in the network
102 |             # could be dropped from the partial cascade
103 |             stop("There are nodes in the partial cascade that are not part of the diffusion network. Dropping these nodes. If these nodes should be included for potential out-of-network diffusion, please provide them via the `nodes` argument.")
104 |             #overlap = partial_cascade$cascade_nodes[[1]][
105 |             #    is.element(partial_cascade$cascade_nodes[[1]], nodes)]
106 |             #partial_cascade = subset_cascade(partial_cascade, overlap)
107 |         }
108 |     }
109 |     n_nodes <- length(nodes)
110 |     
111 |     if(!is.null(start_probabilities) & !is.null(partial_cascade)) {
112 |         stop("Start probabilities are not allowed with partial cascades")
113 |     }
114 |     
115 |     if(is.null(start_probabilities)) {
116 |         start_probabilities <- rep(1/n_nodes, n_nodes)
117 |     }
118 |     # Check start probabilities
119 |     qassert(start_probabilities, paste0('N', n_nodes, '[0,1]'))
120 |     
121 |     # Check partial cascade input 
122 |     if(!is.null(partial_cascade)) {
123 |         assert_that(is.cascade(partial_cascade))
124 |         assert_that(length(partial_cascade$cascade_nodes) == 1)
125 |         assert_that(length(partial_cascade$cascade_times) == 1)
126 |         see_if(all(is.element(partial_cascade$cascade_nodes[[1]], nodes)))
127 |     }
128 |     
129 |     ## Create adjacency matrix from edgelist (ordered as in nodes, row -> sender)
130 |     X_ <- matrix(0, ncol = n_nodes, nrow = n_nodes)
131 |     for(k in 1:nrow(diffnet)) {
132 |         i <- which(nodes == diffnet[k, 1])
133 |         j <- which(nodes == diffnet[k, 2])
134 |         X_[i, j] <- 1
135 |     }
136 |     
137 |     sim_out <- lapply(X = 1:nsim, FUN = simulate_cascade_, n_nodes = n_nodes, 
138 |                       params = params, max_time = max_time, 
139 |                       model = model, nodes = nodes, X_ = X_, 
140 |                       partial_cascade = partial_cascade, 
141 |                       start_probabilities = start_probabilities)
142 |     out <- do.call(rbind, sim_out)
143 |     rownames(out) <- NULL
144 |     return(out) 
145 | }
146 | 
147 | 
148 | rltruncexp <- function(n, rate, ltrunc) {
149 |     stats::qexp(stats::runif(n, min = stats::pexp(ltrunc, rate = rate), 
150 |                              max = 1), rate = rate)
151 | }
152 | 
153 | rltrunclnorm <- function(n, meanlog, sdlog, ltrunc) {
154 |     stats::qlnorm(stats::runif(n, min = stats::plnorm(ltrunc, meanlog = meanlog,
155 |                                                       sdlog = sdlog)),
156 |                   meanlog = meanlog, sdlog = sdlog)
157 | }
158 | 
159 | # This function generates a matrix of relative diffusion times (rows senders,
160 | # columns receivers) where all diffusion times sent by the nodes involved in the
161 | # partial cascade are left truncated to be at least long enough to not infect 
162 | # any nodes before the absolute left censoring time (the last infection time
163 | # in the partial cascade).
164 | truncated_rel_diff_times <- function(n_nodes, nodes, partial_cascade, model, 
165 |                                      params) {
166 |     out <- matrix(stats::rexp(n_nodes^2, rate = params), nrow = n_nodes)
167 |     rownames(out) <- colnames(out) <- nodes
168 |     
169 |     # Get the truncation point for each node in the partial cascade     
170 |     pc_times <- partial_cascade$cascade_times[[1]]
171 |     trunc_points <- pc_times[length(pc_times)] - pc_times
172 |    
173 |     # Truncated draws for nodes in partial cascade 
174 |     pc_nodes = partial_cascade$cascade_nodes[[1]]
175 |     
176 |     if(model == 'exponential') {
177 |         out[pc_nodes, ] = t(sapply(trunc_points, 
178 |                                    function(x) rltruncexp(n_nodes, params, x)))
179 |     } else if(model == "log-normal") {
180 |         out[pc_nodes, ] = t(sapply(trunc_points, 
181 |                                    function(x) rltrunclnorm(n_nodes, params[1], 
182 |                                                             params[2], x)))
183 |     }
184 |     return(out)    
185 | }
186 | 
187 | # Simulate a single cascade from scratch (random first event node)
188 | simulate_cascade_ <- function(i, nodes, n_nodes, params, max_time, model, X_, 
189 |                               partial_cascade, start_probabilities) {
190 |     beta = 0.5
191 |     epsilon = 10e-9
192 |    
193 |     # Generate relative diffusion times for all pairs
194 |     if(is.null(partial_cascade)) {
195 |         if(model == "exponential") {
196 |             rel_diff_times <- matrix(stats::rexp(n_nodes^2, rate = params), 
197 |                                      nrow = n_nodes)
198 |         } else if(model == "log-normal") {
199 |             rel_diff_times <- matrix(stats::rlnorm(n_nodes^2, meanlog = params[1],
200 |                                                    sdlog = params[2]), 
201 |                                      nrow = n_nodes)       
202 |         }
203 |         rownames(rel_diff_times) <- colnames(rel_diff_times) <- nodes
204 |     } else {
205 |         rel_diff_times <- truncated_rel_diff_times(n_nodes, nodes, 
206 |                                                    partial_cascade, model, 
207 |                                                    params)
208 |     }
209 |    
210 |    
211 |     # No diffusion of node to itself
212 |     diag(rel_diff_times) <- 0
213 |     if(!is.null(partial_cascade)) {
214 |         # No diffusion to nodes that already had an event in partial_cascade
215 |         idx <- which(is.element(nodes, partial_cascade$cascade_nodes[[1]]))
216 |         rel_diff_times[, idx] <- 0       
217 |     }
218 |     # Censor at maximum observation time
219 |     rel_diff_times[rel_diff_times > max_time] <- 0
220 |     
221 |     # Create matrix to transform network for out of network diffusion 
222 |     norm_epsilon <- epsilon / (epsilon + beta)
223 |     Y <- matrix(0, ncol = n_nodes, nrow = n_nodes) 
224 |     # Set colums to 1 with probability norm_ep
225 |     Y[, as.logical(stats::rbinom(n_nodes, 1, prob = norm_epsilon))] <- 1
226 |     
227 |     # Set relative diffusion times between nodes with no corresponding diffusion
228 |     # -edge to 0 
229 |     rel_diff_times <- (X_ - Y)^2 * rel_diff_times
230 |     
231 |     if(is.null(partial_cascade)) {
232 |         start_nodes <- sample(x = nodes, size = 1, prob = start_probabilities)
233 |     } else {
234 |         start_nodes <- nodes[idx]
235 |     }
236 |     # Find shortest path from start node to every other (reachable) node
237 |     if (requireNamespace("igraph", quietly = TRUE)) {
238 |         g <- igraph::graph.adjacency(rel_diff_times, weighted = TRUE, 
239 |                                  mode = "directed")
240 |         dists <- igraph::distances(g, v = start_nodes, mode = "out")
241 |         # reorder
242 |         if(!is.null(partial_cascade)) {
243 |             dists <- dists[partial_cascade$cascade_nodes[[1]], , drop = FALSE]
244 |         }
245 |         
246 |     } else {
247 |         stop("In order to use this functionality the `igraph` package needs to be installed. Run `install.packages('igraph')` and retry.")
248 |     } 
249 |     
250 |     if(is.null(partial_cascade)) {
251 |         prev_event_times <- 0
252 |     } else {
253 |         prev_event_times <- partial_cascade$cascade_times[[1]]
254 |     }
255 |     # Find the earliest event time for each node
256 |     ## Add the event time of each node in partial_cascade to shortest path 
257 |     ## from this node (or add 0 if no partial cascade)
258 |     abs_dists <- t(sapply(1:nrow(dists), 
259 |                           function(j) dists[j, ] + prev_event_times[j]))
260 |     d <- apply(abs_dists, 2, min) 
261 |     d <- d[!is.infinite(d)]
262 |     
263 |     out <- data.frame("node_name" = names(d), 
264 |                       "event_time" = as.numeric(d),
265 |                       "cascade_id" = i)
266 |     return(out)
267 | }
268 | 
269 | 


--------------------------------------------------------------------------------
/vignettes/tutorial_vignette.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "NetworkInference Tutorial: Persistent Policy Diffusion Ties"
  3 | author: "Fridolin Linder and Bruce Desmarais"
  4 | date: "`r Sys.Date()`"
  5 | output: rmarkdown::html_vignette
  6 | vignette: >
  7 |   %\VignetteIndexEntry{NetworkInference Tutorial: Persistent Policy Diffusion Ties}
  8 |   %\VignetteEngine{knitr::rmarkdown}
  9 |   %\VignetteEncoding{UTF-8}
 10 | ---
 11 | 
 12 | ```{r setup, include = FALSE}
 13 | knitr::opts_chunk$set(
 14 |   collapse = TRUE,
 15 |   comment = "#>"
 16 | )
 17 | ```
 18 | 
 19 | 
 20 | ---
 21 | 
 22 | # Introduction
 23 | 
 24 | ---
 25 | 
 26 | In this tutorial we go through some of the functionality of the `NetworkInference` package using the example application from Desmarais et al. (2015) and extending it with the recently released [State Policy Innovation and Diffusion](https://doi.org/10.7910/DVN/CVYSR7) (SPID) database. In this paper Desmarais et al. infer a latent network for policy diffusion based on adoption of policies in the US states.
 27 | 
 28 | `netinf` infers the optimal diffusion network from a set of *nodes* (in this case US-states) and a number of so called *cascades* (here a cascade corresponds to a policy that is adopted by at least two states). A cascade is a series of events occurring at 
 29 | a specified time. 
 30 | 
 31 | ## Installing the package
 32 | 
 33 | Installing `NetworkInference`:
 34 | ```{r, eval=FALSE}
 35 | install.packages('NetworkInference')
 36 | ```
 37 | 
 38 | 
 39 | Some other packages required for this tutorial:
 40 | ```{r, eval=FALSE}
 41 | install.packages(c('dplyr', 'igraph'))
 42 | ```
 43 | 
 44 | ## Exploring SPID Data
 45 | 
 46 | The policy adoption data is available in the package:
 47 | 
 48 | ```{r, message=FALSE}
 49 | library(NetworkInference)
 50 | # Load the `policies` dataset (?policies for details).
 51 | data('policies')
 52 | ```
 53 | 
 54 | This loads two `data.frame`-objects. `policies` contains the adoption events and `policies_metadata` contains additional information on each policy.
 55 | 
 56 | ```{r, eval=FALSE}
 57 | head(policies)
 58 | ```
 59 | 
 60 | ```{r, results="asis", echo=FALSE}
 61 | pander::pandoc.table(head(policies))
 62 | ```
 63 | 
 64 | The first column (`statenam`) gives the state that adopted the `policy` in year `adopt_year`. Let's take a closer look at the data.
 65 | 
 66 | Number of policies:
 67 | ```{r}
 68 | length(unique(policies$policy))
 69 | ```
 70 | Number of adoption events:
 71 | ```{r}
 72 | nrow(policies)
 73 | ```
 74 | Some example policies:
 75 | ```{r}
 76 | unique(policies$policy)[1:5]
 77 | ```
 78 | 
 79 | Not all the policy abbreviations are understandable. So let's check the metadata for more information:
 80 | ```{r, eval=FALSE}
 81 | library(dplyr)
 82 | filter(policies_metadata, policy %in% unique(policy)[100:104]) %>%
 83 |     select(-source)
 84 | ```
 85 | ```{r, results="asis", echo=FALSE, message=FALSE}
 86 | library(dplyr)
 87 | pander::pandoc.table(filter(policies_metadata, policy %in% unique(policy)[100:104]) %>%
 88 |                          select(-source))
 89 | ```
 90 | 
 91 | 
 92 | ## Preparing the Data for NetworkInference
 93 | 
 94 | Most functionality of the `NetworkInference` package is based on the `cascades` 
 95 | data format. So before starting with the analysis we have to transform our data
 96 | to such an object.
 97 | 
 98 | ```{r}
 99 | policy_cascades <- as_cascade_long(policies, cascade_node_name = 'statenam',
100 |                                    event_time = 'adopt_year', 
101 |                                    cascade_id = 'policy')
102 | ```
103 | 
104 | In this case we used the function `as_cascade_long`. If your data is in wide format you can convert it using the function `as_cascade_wide`.
105 | 
106 | The `cascade` class contains the same data as the `policies` `data.frame`, just in a different format. You don't need to understand how the object is constructed but let's take a look for clarity:
107 | 
108 | ```{r}
109 | class(policy_cascades)
110 | length(policy_cascades)
111 | names(policy_cascades)
112 | ```
113 | 
114 | The `cascade` class is basically a list containing three elements: 
115 | 
116 | ```{r}
117 | policy_cascades$cascade_nodes[2:3]
118 | ```
119 | 
120 | ```{r}
121 | policy_cascades$cascade_times[2:3]
122 | ```
123 | 
124 | There are a few convenience functions to manipulate the cascades (but you can also manipulate the data before converting it to the `cascade` format).
125 | 
126 | ### Subsetting by Cascade
127 | ```{r}
128 | selected_policies <- subset_cascade(cascade = policy_cascades, 
129 |                                     selection = c('clinic_access', 'cogrowman'))
130 | selected_policies[1:2]
131 | ```
132 | 
133 | ## Subsetting in Time
134 | 
135 | ```{r}
136 | time_constrained <- subset_cascade_time(cascade = selected_policies, 
137 |                                         start_time = 1990, end_time = 2000)
138 | time_constrained[1:2]
139 | ```
140 | 
141 | ## Removing Nodes (States)
142 | 
143 | ```{r}
144 | less_nodes <- drop_nodes(cascades = time_constrained, 
145 |                          nodes = c('Maryland', 'Washington'))
146 | less_nodes[1:2]
147 | ```
148 | 
149 | ### Visually Inspecting Cascades
150 | 
151 | It's always good practice to visually inspect the data before working with it. 
152 | The `NetworkInference` package provides functionality to visualize the cascade 
153 | data. 
154 | 
155 | The function `summary.cascades()` provides quick summary statistics on the 
156 | cascade data:
157 | 
158 | ```{r}
159 | summary(policy_cascades)
160 | ```
161 | 
162 | The first four lines provide the number of cascades, the number of nodes in the 
163 | system, the number of nodes involved in cascades (there might be states that we 
164 | don't have diffusion data on, but we still want them represented in the dataset)
165 | and the possible number of edges in a potential diffusion network (a diffusion edge 
166 | between nodes `u` and `v` only makes sense if there is at least one cascade in 
167 | which `u` experiences an event before `v`). In this example there are 187 
168 | policies and 50 states. Each state is involved in at least one policy cascade 
169 | and a fully connected diffusion network would have 2450 edges. 
170 | 
171 | It also provides summary statistics on the distribution of the cascade lengths 
172 | (number of nodes involved in each cascade) and the number of ties in the 
173 | cascades (two nodes experiencing the same event at the same time). For our 
174 | example, we can see that the 'smallest' policy was adopted by 10 states and the
175 | 'largest' by all 50 states. From the tie summaries we see that there is at least
176 | one policy that was adopted by 45 states in the same year. 
177 | 
178 | The `plot()` method allows to plot cascades with varying degrees of detail. The 
179 | argument `label_nodes` (`TRUE/FALSE`) provides node labels which require more space
180 | but provide more detail. The argument `selection` allows to pick a subset of 
181 | cascades to visualize in case there are too many to plot. If `label_nodes` is set 
182 | to `FALSE` each event is depicted by a dot, which allows to visualize more cascades
183 | simultaneously. 
184 | 
185 | Let's first look at the visualization with labels. Here we plot two cascades, 
186 | selected by their name:
187 | ```{r, fig.align='center', fig.width=7, fig.height=4}
188 | selection <- c('guncontrol_assaultweapon_ba', 'guncontrol_licenses_dealer')
189 | plot(policy_cascades, label_nodes = TRUE, selection = selection)
190 | ```
191 | 
192 | We can also plot more cascades with less detail:
193 | ```{r, fig.align='center', fig.width=7, fig.height=4}
194 | selection <- c('waiting', 'threestrikes', 'unionlimits', 'smokeban', 
195 |                'paperterror', 'miglab', 'methpre', 'lott', 'lemon', 'idtheft',
196 |                'harass', 'hatecrime', 'equalpay')
197 | plot(policy_cascades, label_nodes = FALSE, selection = selection)
198 | ```
199 | 
200 | This produces a ['violin plot'](https://en.wikipedia.org/wiki/Violin_plot) for each cascade with the single diffusion events overplotted as dots. As we already saw in the previous visualization, the policy data has a lot of ties (i.e. many states adopted a policy in the same year) which is indicated by the areas of higher density in the violin plot.
201 | 
202 | 
203 | ## Inferring the Latent Diffusion Network
204 | 
205 | The `netinf` algorithm is implemented in the `netinf()` function. The `netinf` inferrs edges based on a diffusion model. That is we assume a parametric model for the diffusion times between edges. Currently three different diffusion models are implemented: The exponential distribution, the rayleigh distribution and the log-normal distribution. The model can be chosen with the `trans_mod` argument (default is the exponential distribution). 
206 | 
207 | ### Classic Netinf
208 | 
209 | In the original implementation the number of edges to infer had to be fixed and chosen by the researcher. If you want to run `netinf` in this classic way you can do so by specifiying all parameters and the number of edges:
210 | 
211 | ```{r}
212 | results <- netinf(policy_cascades, trans_mod = "exponential", n_edges = 100, 
213 |                   params = 0.5, quiet = TRUE)
214 | ```
215 | 
216 | The exponential model has only one parameter (lambda or the rate). If there are more parameters the details section in the documentation of the `netinf` function (`?netinf`) has more detail on how to specify parameters and on the specific parametrization used by the implementation. 
217 | 
218 | `n_edges` specifies how many edges should be inferred. See @gomez2010inferring and @desmarais2015persistent for guidance on choosing this parameter if running netinf in classic mode. If the number of edges is specified manually, it has to be lower than the maximum number of possible edges. An edge `u->v` is only possible if in at least one cascade `u` experiences an event *before* `v`. This means, that the maximum number of edges depends on the data. The function `count_possible_edges()` allows us to compute the maximum number of edges in advance:
219 |  
220 | ```{r}
221 | npe <- count_possible_edges(policy_cascades)
222 | npe
223 | ```
224 | 
225 | 
226 | ### Automatic Parameter Selection
227 | 
228 | With version 1.2.0 `netinf` can be run without manually specifying the number of edges or the parameters of the diffusion model. 
229 | 
230 | #### Selecting the number of edges automatically
231 | 
232 | After each iteration of the netinf algorithm, we check if the edge added significant improvement to the network. This is done via a vuong style test. Given the likelihood score for each cascade conditional on the network inferred so far, we penalize the network with one addional edge and test if the increase in likelihood accross all cascades is significant. The user still has to specify a p-value cut-off. If the p-value of an edge is larger than the specified cut-off the algorithm stops inferring more edges. The cut-off is set via the `p_value_cutoff` argument.
233 | 
234 | ```{r}
235 | results <- netinf(policy_cascades, trans_mod = "exponential", 
236 |                   p_value_cutoff = 0.1, params = 0.5, quiet = TRUE)
237 | nrow(results)
238 | ```
239 | 
240 | We see that with a fixed lambda of 0.5 and a p-value cut-off of 0.1 the algorithm inferred 872 edges.
241 | 
242 | #### Selecting the parameters of the diffusion model
243 | 
244 | The diffusion model parameters can be selected automatically. Setting the `params` argument to `NULL` (default value) makes the `netinf` function initialize the parameters automatically. The parameters are initialized at the midpoint between the MLE of the minimum diffusion times and the MLE of the maximum diffusion times, across all cascades. Edges are then inferred until either the p-value cut-off or a manually specified number of edges (`n_edges`) is reached. 
245 | 
246 | ```{r}
247 | results <- netinf(policy_cascades, trans_mod = "exponential", 
248 |                   p_value_cutoff = 0.1, quiet = TRUE)
249 | nrow(results)
250 | ```
251 | 
252 | ### Netinf output
253 | 
254 | Let's take a look at the output of the algorithm. The output is a dataframe containing the inferred latent network in the form of an edgelist:
255 | 
256 | ```{r, eval=FALSE, echo=TRUE}
257 | head(results)
258 | ```
259 | ```{r, results = "asis", echo=FALSE}
260 | pander::pandoc.table(head(results))
261 | ```
262 | 
263 | Each row corresponds to a directed edge. The first column indicates the origin node, the second the destination node. The third column displays the gain in model fit from each added edge. The last column displays the p-value from the Vuong test of each edge. There is a generic plot method to inspect the results. If more tweaking is required, the results are a dataframe so it should be easy for the more experienced users to make your own plot. With `type = "improvement"` the improvement from each edge can be plotted:
264 | 
265 | ```{r, fig.align='center', fig.width=7, fig.height=4}
266 | plot(results, type = "improvement")
267 | ```
268 | 
269 | We can also quickly check the p-value from the Vuong test associated with each edge addition:
270 | 
271 | ```{r, fig.align='center', fig.width=7, fig.height=4}
272 | plot(results, type = 'p-value')
273 | ```
274 | 
275 | In order to produce a quick visualization of the resulting diffusion network we can use the plot method again, this time with `type = "network"`. Note that in order to use this functionality the igraph package has to be installed.
276 | 
277 | ```{r, fig.width=7, fig.height=5.5}
278 | #install.packages('igraph')
279 | # For this functionality the igraph package has to be installed
280 | # This code is only executed if the package is found:
281 | if(requireNamespace("igraph", quietly = TRUE)) {
282 |     plot(results, type = "network")
283 | }
284 | ```
285 | 
286 | If additional tweaking of the plot is desired, the network can be visualized using `igraph` explicitly. We refer you you to the [igraph documentation](https://CRAN.R-project.org/package=igraph) for details on how to customize the plot.
287 | 
288 | ```{r, message=FALSE, eval=FALSE}
289 | if(requireNamespace("igraph", quietly = TRUE)) {
290 |     library(igraph) 
291 |     g <- graph_from_data_frame(d = results[, 1:2])
292 |     plot(g, edge.arrow.size=.3, vertex.color = "grey70")
293 | }
294 | ```


--------------------------------------------------------------------------------
/R/cascade.R:
--------------------------------------------------------------------------------
  1 | # Functions related to the cascade class
  2 | 
  3 | #' Is the object of class cascade?
  4 | #' 
  5 | #' @param object the object to be tested.
  6 | #' 
  7 | #' @return \code{TRUE} if object is a cascade, \code{FALSE} otherwise.
  8 | #' 
  9 | #' @examples
 10 | #' 
 11 | #' data(cascades)
 12 | #' is.cascade(cascades)
 13 | #' # > TRUE
 14 | #' is.cascade(1) 
 15 | #' # > FALSE
 16 | #' @export
 17 | is.cascade <- function(object) {
 18 |     inherits(object, "cascade")
 19 | }
 20 | 
 21 | 
 22 | #' Transform long data to cascade 
 23 | #'
 24 | #' Create a cascade object from data in long format.
 25 | #' 
 26 | #' Each row of the data describes one event in the cascade. The data must 
 27 | #' contain at least three columns:
 28 | #' \enumerate{
 29 | #'    \item Cascade node name: The identifier of the node that experiences the 
 30 | #'        event.
 31 | #'    \item Event time: The time when the node experiences the event. Note that
 32 | #'        if the time column is of class date or any other special time class, 
 33 | #'        it will be converted to an integer with `as.numeric()`. 
 34 | #'    \item Cascade id: The identifier of the cascade that the event pertains to.
 35 | #' }
 36 | #' The default names for these columns are \code{node_name}, \code{event_time} 
 37 | #' and \code{cascade_id}. If other names are used in the \code{data} object the 
 38 | #' names have to be specified in the corresponding arguments (see argument 
 39 | #' documentation)
 40 | #' 
 41 | #' @import checkmate 
 42 | #' @import assertthat
 43 | #' 
 44 | #' @param data \link{data.frame}, containing the cascade data 
 45 | #'     with column names corresponding to the arguments provided to 
 46 | #'     \code{cascade_node_names}, \code{event_time} and \code{cascade_id}.
 47 | #' @param cascade_node_name character, column name of \code{data} that specifies 
 48 | #'     the node names in the cascade. 
 49 | #' @param event_time character, column name of \code{data} that specifies the 
 50 | #'     event times for each node involved in a cascade.
 51 | #' @param cascade_id character, column name of the cascade identifier.
 52 | #' @param node_names character, factor or numeric vector containing the names for each node. 
 53 | #'     Optional. If not provided, node names are inferred from the cascade data.
 54 | #'     
 55 | #' @return An object of class \code{cascade}. This is a list containing three
 56 | #'     (named) elements: 
 57 | #'     \enumerate{
 58 | #'         \item \code{"node_names"} A character vector of node names.
 59 | #'         \item \code{"cascade_nodes"} A list with one character vector per
 60 | #'             cascade containing the node names in order of the events.
 61 | #'         \item \code{"cascade_times"} A list with one element per cascade 
 62 | #'             containing the event times for the nodes in \code{"cascade_names"}.
 63 | #'     }
 64 | #' 
 65 | #' @examples 
 66 | #' 
 67 | #' df <- simulate_rnd_cascades(10, n_nodes = 20)
 68 | #' cascades <- as_cascade_long(df)
 69 | #' is.cascade(cascades)
 70 | #
 71 | #' @export
 72 | as_cascade_long <- function(data, cascade_node_name = "node_name", 
 73 |                              event_time = "event_time", 
 74 |                              cascade_id = "cascade_id", node_names = NULL) {
 75 |     # Check all inputs 
 76 |     data <- as.data.frame(data)
 77 |     if(is.null(node_names)) {
 78 |         node_names <- as.character(unique(data[, cascade_node_name]))
 79 |     }
 80 |     qassert(cascade_node_name, 'S1')
 81 |     qassert(event_time, 'S1')
 82 |     qassert(cascade_id, 'S1')
 83 |     assert_that(is.element(cascade_node_name, colnames(data)))
 84 |     assert_that(is.element(event_time, colnames(data)))
 85 |     assert_that(is.element(cascade_id, colnames(data)))
 86 |     assert_data_frame(data, min.rows = 1, min.cols = 3)
 87 |     
 88 |     # Transform the data  
 89 |     ## Transform cascade ids and node names to character to get consistency 
 90 |     ## down the line
 91 |     data[, cascade_node_name] <- as.character(data[, cascade_node_name])
 92 |     data[, cascade_id] <- as.character(data[, cascade_id])
 93 |     
 94 |     ## Transform to cascade data structure
 95 |     splt <- split(data, f = data[, cascade_id]) 
 96 |     cascade_nodes <- lapply(splt, function(x) x[, cascade_node_name])
 97 |     cascade_times <- lapply(splt, function(x) as.numeric(x[, event_time]))
 98 |     cascade_times <- lapply(cascade_times, as.numeric)
 99 |     names(cascade_nodes) <- names(splt)
100 |     names(cascade_times) <- names(splt)
101 |     
102 |     # Check if data is consistent
103 |     assert_cascade_consistency_(cascade_nodes, cascade_times, node_names)
104 |     
105 |     out <- list("cascade_nodes" = cascade_nodes, 
106 |                 "cascade_times" = cascade_times, 
107 |                 "node_names" = node_names)
108 |     class(out) <- c("cascade", "list")
109 |     
110 |     out <- order_cascade_(out)
111 |     
112 |     return(out)
113 |     
114 | }
115 | 
116 | 
117 | #' Transform wide data to cascade
118 | #' 
119 | #' Create a cascade object from data in wide format.
120 | #' 
121 | #' If data is in wide format, each row corresponds to a node and each column to
122 | #' a cascade. Each cell indicates the event time for a node - cascade 
123 | #' combination. If a node did not experience an event for a cascade (the node
124 | #' is censored) the cell entry must be \code{NA}.
125 | #' 
126 | #' @import checkmate 
127 | #' @import assertthat
128 | #' 
129 | #' @param data \link{data.frame} or \link{matrix}, rows corresponding to nodes, 
130 | #'     columns to cascades. Matrix entries are the event times for each node, 
131 | #'     cascade pair. Missing values indicate censored observations, that is, 
132 | #'     nodes that did not have an event). Specify column and row names if 
133 | #'     cascade and node ids other than integer sequences are  desired. Note that, 
134 | #'     if the time column is of class date or any other special time class, it 
135 | #'     will be converted to an integer with `as.numeric()`. 
136 | #' @param node_names character, factor or numeric vector, containing names for each node. 
137 | #'     Optional. If not provided, node names are inferred from the provided data.
138 | #'     
139 | #' @return An object of class \code{cascade}. This is a list containing three
140 | #'     (named) elements: 
141 | #'     \enumerate{
142 | #'         \item \code{"node_names"} A character vector of node names.
143 | #'         \item \code{"cascade_nodes"} A list with one character vector per
144 | #'             cascade containing the node names in order of the events.
145 | #'         \item \code{"cascade_times"} A list with one element per cascade 
146 | #'             containing the event times for the nodes in \code{"cascade_names"}.
147 | #'     }
148 | #
149 | #' @examples 
150 | #' 
151 | #' data("policies")
152 | #' cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 
153 | #'                             event_time = 'adopt_year', cascade_id = 'policy')
154 | #' wide_policies = as.matrix(cascades)
155 | #' cascades <- as_cascade_wide(wide_policies)
156 | #' is.cascade(cascades)
157 | #' 
158 | #' @export
159 | #' 
160 | as_cascade_wide <- function(data, node_names = NULL) {
161 |     
162 |     # Check all inputs 
163 |     if(is.null(node_names)) {
164 |         # Get node names
165 |         if(is.null(rownames(data))) {
166 |             msg <- paste("No rownames provided for data matrix. Assigning integer",
167 |                          "names to nodes.\n")
168 |             warning(msg)
169 |             node_names <- as.character(c(1:nrow(data)))
170 |         } else {
171 |             node_names <- rownames(data)
172 |         }
173 |     }
174 |     
175 |     assert(
176 |         checkClass(data, "data.frame"),
177 |         checkClass(data, "matrix")
178 |     ) 
179 |     data <- as.matrix(data)
180 |     assert_matrix(data, all.missing = FALSE)
181 |     assert_that(length(node_names) == nrow(data))
182 |  
183 |     # Transform the data  
184 |     ## Get cascade ids
185 |     if(is.null(colnames(data))) {
186 |         msg <- paste("No column names provided for data. Assigning integer names",
187 |                      "to cascades.\n")
188 |         warning(msg)
189 |         cascade_ids <- as.character(c(1:ncol(data)))
190 |     } else {
191 |         cascade_ids <- colnames(data)
192 |     }
193 |     
194 |     ## Transform to cascade data structure
195 |     nona_times <- apply(data, 2, clean_casc_vec_, mode = "times", data = data,
196 |                         node_names = node_names)
197 |     nona_nodes <- apply(data, 2, clean_casc_vec_, mode = "nodes", data = data,
198 |                         node_names = node_names)
199 |     # If dim(data)[2] = 1 apply returns vector, if > 1 it returns list. Generate
200 |     # equivalent output in both cases:
201 |     if(inherits(nona_times, "matrix")) {
202 |         nona_times <- list(nona_times)
203 |         names(cascade_times) <- colnames(nona_times)
204 |         cascade_nodes <- list(as.character(nona_nodes))
205 |         names(cascade_nodes) <- colnames(nona_nodes)
206 |         
207 |     } else { # already list
208 |         cascade_nodes <- nona_nodes
209 |     }
210 |        
211 |     # Check if data is consistent
212 |     assert_cascade_consistency_(cascade_nodes, nona_times, node_names)
213 |     
214 |     out <- list("cascade_nodes" = cascade_nodes, 
215 |                 "cascade_times" = nona_times, 
216 |                 "node_names" = node_names)
217 |     class(out) <- c("cascade", "list")
218 |     out <- order_cascade_(out)
219 |     
220 |     return(out)   
221 | }
222 | 
223 | 
224 | # Clean cascade vector (remove nas and sort)
225 | clean_casc_vec_ <- function(x, mode, data, node_names) {
226 |     n <- node_names[!is.na(x)]
227 |     x <- as.numeric(x[!is.na(x)])
228 |     times <- sort(x, decreasing = TRUE)
229 |     n <- n[order(x, decreasing = TRUE)]
230 |     names(times) <- NULL
231 |     names(n) <- NULL
232 |     if(mode == "times") return(times)
233 |     else return(n)
234 | }
235 | 
236 | #' Convert a cascade object to a matrix
237 | #' 
238 | #' Generates a \code{\link{matrix}} containing the cascade information in the 
239 | #' cascade object in wide format. Missing values are used for nodes that do not 
240 | #' experience an event in a cascade.
241 | #' 
242 | #' @param x cascade object to convert.
243 | #' @param ... additional arguments to be passed to or from methods. 
244 | #'     (Currently not supported.)
245 | #' 
246 | #' @return A matrix containing all cascade information in wide format. That is,
247 | #' each row of the matrix corresponds to a node and each column to a cascade. 
248 | #' Cell entries are event times. Censored nodes have \code{NA} for their entry.
249 | #'     
250 | #' @examples
251 | #' 
252 | #' data(cascades) 
253 | #' as.matrix(cascades)
254 | #' 
255 | #' @export 
256 | as.matrix.cascade <- function(x, ...) {
257 |     
258 |     # Check inputs
259 |     assert_that(inherits(x, "cascade"))
260 |    
261 |     cids <- names(x$cascade_times) 
262 |     nids <- x$node_names
263 |     
264 |     time_lookup_ <- function(pair) {
265 |         cid <- pair[1]
266 |         nid <- pair[2]
267 |         match <- which(x$cascade_nodes[[cid]] == nid)
268 |         if(length(match) == 1) {
269 |             return(x$cascade_times[[cid]][match])
270 |         } else {
271 |             return(NA)
272 |         }
273 |     }
274 |     
275 |     combos <- expand.grid(cids, nids) 
276 |     times <- apply(combos, 1, time_lookup_)
277 |     
278 |     # Reshape to matrix
279 |     out <- matrix(times, nrow = length(nids), ncol = length(cids), byrow = TRUE)
280 |     rownames(out) <- nids
281 |     colnames(out) <- cids
282 |     
283 |     return(out)    
284 | }
285 | 
286 | #' Select a subset of cascades from cascade object
287 | #' 
288 | #' @param cascade cascade, object to select from
289 | #' @param selection character or integer, vector of cascade_ids to select
290 | #' 
291 | #' @return An object of class cascade containing just the selected cascades
292 | #' 
293 | #' @examples
294 | #' 
295 | #' data(policies)
296 | #' cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 
297 | #'                             event_time = 'adopt_year', cascade_id = 'policy')
298 | #' cascade_names <- names(cascades$cascade_times)
299 | #' subset_cascade(cascades, selection = cascade_names[1:10])
300 | #' 
301 | #' @export
302 | subset_cascade <- function(cascade, selection) {
303 |     # Check inputs
304 |     assert_that(inherits(cascade, 'cascade'))
305 |     cascade_names <- names(cascade$cascade_times)
306 |     assert_that(all(selection %in% cascade_names))
307 |     
308 |     cascade_times <- cascade$cascade_times[selection]
309 |     cascade_nodes <- cascade$cascade_nodes[selection]
310 |     #node_names <- unique(do.call(c, cascade_nodes))
311 |     node_names <- cascade$node_names
312 |     out <- list(cascade_nodes = cascade_nodes, cascade_times = cascade_times,
313 |                 node_names = node_names)
314 |     class(out) <- c('cascade', 'list')
315 |     return(out)
316 | }
317 | 
318 | #' Drop nodes from a cascade object
319 | #' 
320 | #' @param cascades cascade, object to drop nodes from.
321 | #' @param nodes character or integer, vector of node_ids to drop.
322 | #' @param drop logical, Should empty cascades be dropped.
323 | #' 
324 | #' @return An object of class cascade containing the cascades without the 
325 | #'     dropped nodes.
326 | #' 
327 | #' @examples
328 | #' 
329 | #' data(policies)
330 | #' cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 
331 | #'                             event_time = 'adopt_year', cascade_id = 'policy')
332 | #' new_cascades <- drop_nodes(cascades, c("California", "New York"))
333 | #' 
334 | #' @export
335 | drop_nodes <- function(cascades, nodes, drop = TRUE) {
336 |     # Check inputs
337 |     assert_that(inherits(cascades, 'cascade'))
338 |     assert_that(all(nodes %in% cascades$node_names))
339 |     c_length <- length(cascades$cascade_nodes)
340 |     cascade_ids <- names(cascades$cascade_nodes)
341 |     
342 |     # Drop nodes from cascade_nodes, cascade_times and node_names 
343 |     drop_idxs <- lapply(cascades$cascade_nodes, function(x) which(x %in% nodes))
344 |     cascade_nodes <- lapply(1:c_length, function(i) {
345 |        cascades$cascade_nodes[[i]][-drop_idxs[[i]]] 
346 |     })
347 |     names(cascade_nodes) <- cascade_ids
348 |     cascade_times <- lapply(1:c_length, function(i) {
349 |        cascades$cascade_times[[i]][-drop_idxs[[i]]] 
350 |     })
351 |     names(cascade_times) <- cascade_ids
352 |     node_names <- cascades$node_names[!cascades$node_names %in% nodes]
353 |     
354 |     # Check for empty cascades
355 |     if(drop) {
356 |        cascade_nodes <- remove_zero_length_(cascade_nodes)
357 |        cascade_times <- remove_zero_length_(cascade_times)
358 |     }
359 |     
360 |     out <- list(cascade_nodes = cascade_nodes, cascade_times = cascade_times,
361 |                 node_names = node_names)
362 |     class(out) <- c('cascade', 'list')
363 |     return(out)
364 | }
365 | 
366 | 
367 | 
368 | #' Subset a cascade object in time
369 | #' 
370 | #' Remove each all events occurring outside the desired subset for each cascade 
371 | #' in a cascade object.
372 | #' 
373 | #' @param cascade cascade, object to subset.
374 | #' @param start_time numeric, start time of the subset.
375 | #' @param end_time numeric, end time of the subset.
376 | #' @param drop logical, should empty sub-cascades be dropped?
377 | #'
378 | #' @return An object of class cascade, where only events are included that have 
379 | #'     times \code{start_time} <= t < \code{end_time}.
380 | #'     
381 | #' @examples
382 | #' 
383 | #' data(cascades)
384 | #' sub_cascades <- subset_cascade_time(cascades, 10, 20, drop=TRUE)
385 | #' 
386 | #' @export
387 | subset_cascade_time <- function(cascade, start_time, end_time, drop=TRUE) {
388 |    # Check inputs
389 |    assert_that(inherits(cascade, 'cascade'))
390 |    qassert(start_time, 'N1')
391 |    qassert(end_time, 'N1')
392 |    qassert(drop, 'B1')
393 |     
394 |    casc_length <- length(cascade$cascade_nodes)    
395 |    subset_idxs <- lapply(cascade$cascade_times, function(x) {
396 |        out <- which(x >= start_time & x < end_time)
397 |        }) 
398 |    subset_times <- lapply(1:casc_length, function(x) {
399 |       return(cascade$cascade_times[[x]][subset_idxs[[x]]])
400 |    }) 
401 |    subset_nodes <- lapply(1:casc_length, function(x) {
402 |       return(cascade$cascade_nodes[[x]][subset_idxs[[x]]])
403 |    }) 
404 |    names(subset_times) <- names(subset_nodes) <- names(cascade$cascade_times)
405 |    if(drop) {
406 |        subset_times <- remove_zero_length_(subset_times)
407 |        subset_nodes <- remove_zero_length_(subset_nodes)
408 |    }
409 |    #subset_node_names <- unique(do.call(c, subset_nodes))
410 |    subset_node_names <- cascade$node_names
411 |    out <- list(cascade_nodes = subset_nodes, cascade_times = subset_times, 
412 |                node_names = subset_node_names)
413 |    class(out) <- c("cascade", "list")
414 |    return(out)
415 | }
416 | 
417 | # Remove vectors of length zero from a list of vectors
418 | remove_zero_length_ <- function(x) {
419 |     out <- lapply(x, function(y) if(length(y) == 0) return() else return(y))
420 |     return(Filter(Negate(is.null), out))
421 | }
422 | 
423 | #' Convert a cascade object to a data frame
424 | #' 
425 | #' Generates a data frame containing the cascade information in the cascade object.
426 | #' 
427 | #' @param x Cascade object to convert.
428 | #' @param row.names	NULL or a character vector giving the row names for the data 
429 | #'     frame. Missing values are not allowed.
430 | #' @param optional logical. If TRUE, setting row names and converting column 
431 | #'     names (to syntactic names: see make.names) is optional. (Not supported)
432 | #' @param ... Additional arguments passed to \code{\link{data.frame}}.
433 | #' 
434 | #' @return A data frame with three columns. Containing 1) The names of 
435 | #'     the nodes (\code{"node_name"}) that experience an event in each cascade, 
436 | #'     2) the event time (\code{"event_time"}) of the corresponding node, 
437 | #'     3) the cascade identifier \code{"cascade_id"}.
438 | #'     
439 | #' @examples
440 | #' 
441 | #' data(cascades)
442 | #' as.data.frame(cascades)
443 | #' 
444 | #' @export 
445 | as.data.frame.cascade <- function(x, row.names = NULL, optional = FALSE,
446 |                                   ...) {
447 |     # Check inputs
448 |     assert_that(inherits(x, "cascade"))
449 |     
450 |     # Warning for zero length cascades that will be dropped
451 |     zero_length <- sapply(x$cascade_nodes, function(y) length(y) == 0)
452 |     if(any(zero_length)) {
453 |         dropped <- names(x$cascade_nodes)[zero_length]
454 |         msg <- paste("The following cascades have no events and will be dropped: ", 
455 |                      paste(dropped, collapse = " "))
456 |         warning(msg)        
457 |     }
458 | 
459 |     # Convert
460 |     cascade_nodes <- do.call(c, x$cascade_nodes)
461 |     cascade_times <- do.call(c, x$cascade_times)
462 |     smry <- summary(x, quiet = TRUE)
463 |     ids <- apply(smry, 1, function(x) rep(x[1], each = x[2])) 
464 |     # apply generates matrix if nrow(smry) = 1 and list if > 1, generate 
465 |     # consistent output:
466 |     if(inherits(ids, "matrix")) {
467 |        cascade_ids <- as.character(ids) 
468 |     } else {
469 |        cascade_ids <- do.call(c, ids) 
470 |     }
471 |     out <- data.frame("node_name" = cascade_nodes, 'event_time' = cascade_times, 
472 |                       "cascade_id" = cascade_ids, 
473 |                       stringsAsFactors = FALSE, ...)
474 |     if(!is.null(row.names)) {
475 |         row.names(out) <- row.names
476 |     } else {
477 |         row.names(out) <- as.character(c(1:nrow(out)))
478 |     }
479 |     return(out) 
480 | }
481 | 
482 | # Sort cascades by event time
483 | # 
484 | # @param cascades, object of class cascade
485 | # 
486 | # @return An object of class cascade with each cascade (ids and times) ordered
487 | #     by event time
488 | order_cascade_ <- function(cascades) {
489 |     
490 |     casc_names <- names(cascades$cascade_times)
491 |    
492 |     sort_times <- function(x) return(cascades$cascade_times[[x]][orderings[[x]]])
493 |     sort_nodes <- function(x) return(cascades$cascade_nodes[[x]][orderings[[x]]])
494 |     
495 |     orderings <- lapply(cascades$cascade_times, order)
496 |     times <- lapply(c(1:length(cascades$cascade_times)), sort_times)
497 |     ids <- lapply(c(1:length(cascades$cascade_nodes)), sort_nodes)
498 |     
499 |     cascades$cascade_nodes <- ids    
500 |     cascades$cascade_times <- times
501 |     
502 |     names(cascades$cascade_nodes) <- casc_names
503 |     names(cascades$cascade_times) <- casc_names
504 |     
505 |     return(cascades)
506 | }
507 | 
508 | assert_cascade_consistency_ <- function(cascade_nodes, cascade_times, 
509 |                                         node_names) {
510 | 
511 |     # Check if containers for nodes and event times have same length (same number
512 |     # of cascades)
513 |     if(length(cascade_nodes) != length(cascade_times)) {
514 |        stop("cascade_nodes is not the same length as cascade_times.", 
515 |             call. = FALSE)
516 |     }
517 |     
518 |     # Check if each cascade has same length in event time and node name container
519 |     lens_ids <- sapply(cascade_nodes, length)
520 |     lens_times <- sapply(cascade_times, length)
521 |     if(any(lens_ids != lens_times)) {
522 |        stop("Corresponding elements in cascade_nodes and cascade_times are not of 
523 |        equal length.", call. = FALSE)
524 |     }
525 |     
526 |     # Check that all cascade elements are of correct class 
527 |     tids <- sapply(cascade_nodes, function(x) assert_that(is.element(class(x), 
528 |                                                           c("numeric", "character",
529 |                                                             "factor", "integer"))))
530 |     ttimes <- sapply(cascade_times, function(x) assert_that(is.element(class(x), 
531 |                                                           c("numeric", "integer"))))
532 |     if(!all(tids)) {
533 |       stop("At least one element of cascade_nodes is not of class numeric,
534 |            integer, character or factor or contains missing values.", 
535 |            call. = FALSE) 
536 |     }
537 |     if(!all(ttimes)) {
538 |       stop("At least one element of cascade_times is not of class numeric 
539 |            or contains missing values.", call. = FALSE) 
540 |     }
541 |                          
542 |     # Check consistency between node names in cascades and provided node names
543 |     unique_cascade_nodes <- unique(do.call(c, cascade_nodes))
544 |     chk <- is.element(unique_cascade_nodes, node_names)
545 |     if(!all(chk)) {
546 |         msg <- paste0("The following node(s) that occur in the cascades are ",
547 |                       "not contained in provided node_names:\n", 
548 |                       unique_cascade_nodes[!chk], "\nPlease provide the full ", 
549 |                       "list of node names.")
550 |         stop(msg, .call = FALSE)
551 |     }
552 | }


--------------------------------------------------------------------------------