├── LICENSE ├── data ├── cascades.RData ├── policies.RData ├── validation.RData └── sim_validation.RData ├── tests ├── testthat.R └── testthat │ ├── test_plot.R │ ├── test_netinf.R │ ├── test_simulation.R │ └── test_cascade.R ├── src ├── vuong_test.h ├── vuong_test.cpp ├── NetworkInference_init.c ├── distributions.h ├── distributions.cpp ├── possible_edges.h ├── netinf_utilities.h ├── RcppExports.cpp ├── netinf_utilities.cpp ├── possible_edges.cpp ├── spanning_tree.h ├── netinf.h ├── spanning_tree.cpp └── netinf.cpp ├── readme_figures ├── README-unnamed-chunk-5-1.png ├── README-unnamed-chunk-5-2.png ├── README-unnamed-chunk-9-1.png ├── README-unnamed-chunk-9-2.png └── README-unnamed-chunk-9-3.png ├── .travis.yml ├── .Rbuildignore ├── cran-comments.md ├── R ├── zzz.R ├── RcppExports.R ├── NetworkInference.R ├── summary.R ├── data.R ├── plot.R ├── netinf.R ├── simulate_cascades.R └── cascade.R ├── man ├── is.cascade.Rd ├── is.diffnet.Rd ├── count_possible_edges.Rd ├── sim_validation.Rd ├── simulate_rnd_cascades.Rd ├── subset_cascade.Rd ├── drop_nodes.Rd ├── cascades.Rd ├── subset_cascade_time.Rd ├── as.matrix.cascade.Rd ├── validation.Rd ├── as.data.frame.cascade.Rd ├── plot.diffnet.Rd ├── summary.cascade.Rd ├── plot.cascade.Rd ├── NetworkInference.Rd ├── as_cascade_wide.Rd ├── simulate_cascades.Rd ├── as_cascade_long.Rd ├── policies.Rd └── netinf.Rd ├── release_checklist.md ├── NAMESPACE ├── vignettes ├── bibliography.bib ├── quickstart_vignette.Rmd └── tutorial_vignette.Rmd ├── .gitignore ├── DESCRIPTION ├── netinf_benchmark.R ├── inst └── CITATION ├── README.Rmd ├── README.md └── NEWS.md /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2017 2 | COPYRIGHT HOLDER: Fridolin Linder -------------------------------------------------------------------------------- /data/cascades.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/cascades.RData -------------------------------------------------------------------------------- /data/policies.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/policies.RData -------------------------------------------------------------------------------- /data/validation.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/validation.RData -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(NetworkInference) 3 | 4 | test_check("NetworkInference") 5 | -------------------------------------------------------------------------------- /src/vuong_test.h: -------------------------------------------------------------------------------- 1 | using namespace Rcpp; 2 | double vuong_test(NumericVector x1, NumericVector x2, bool bic=false); -------------------------------------------------------------------------------- /data/sim_validation.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/data/sim_validation.RData -------------------------------------------------------------------------------- /readme_figures/README-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /readme_figures/README-unnamed-chunk-5-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-5-2.png -------------------------------------------------------------------------------- /readme_figures/README-unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /readme_figures/README-unnamed-chunk-9-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-9-2.png -------------------------------------------------------------------------------- /readme_figures/README-unnamed-chunk-9-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/desmarais-lab/NetworkInference/HEAD/readme_figures/README-unnamed-chunk-9-3.png -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Sample .travis.yml for R projects 2 | 3 | language: r 4 | warnings_are_errors: true 5 | sudo: required 6 | branches: 7 | only: 8 | - master 9 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^\.travis\.yml$ 5 | cran-comments.md 6 | ^README\.Rmd$ 7 | readme_figures/ 8 | README.md 9 | release_checklist.md 10 | ^revdep$ 11 | netinf_benchmark.R 12 | vignettes/*_cache/ 13 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | Version 1.2.5 2 | 3 | Maintenance update: remove C++11 requirement per CRAN request; modernize CITATION file. 4 | 5 | ## R CMD check results 6 | 7 | * No errors, warnings, notes 8 | 9 | ## Downstream dependencies 10 | There are no downstream dependencies 11 | -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | .onAttach <- function(...) { 2 | packageStartupMessage( 3 | 'NetworkInference: Inferring latent diffusion networks. 4 | Version 1.2.4.9000 (Development Version) 5 | copyright (c) 2016, Fridolin Linder, Pennsylvania State University 6 | Bruce Desmarais, Pennsylvania State University 7 | For citation information, type citation("NetworkInference"). 8 | Type help("NetworkInference") to get started.\n' 9 | ) 10 | } 11 | -------------------------------------------------------------------------------- /tests/testthat/test_plot.R: -------------------------------------------------------------------------------- 1 | library(NetworkInference) 2 | 3 | context("Test if plotting works.") 4 | 5 | data(cascades) 6 | 7 | test_that("Plotting function runs.", { 8 | p <- plot(cascades, selection = names(cascades$cascade_nodes)[1:3]) 9 | p <- plot(cascades, label_nodes = FALSE) 10 | res <- netinf(cascades, n_edges = 6, params = 1, quiet = TRUE) 11 | #plot(res, "network") 12 | p <- plot(res, "improvement") 13 | expect_warning(plot(cascades)) 14 | }) -------------------------------------------------------------------------------- /src/vuong_test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "distributions.h" 3 | 4 | using namespace Rcpp; 5 | 6 | double vuong_test(NumericVector x1, NumericVector x2, bool bic=false) { 7 | //x1: old (k edges), x2: new (k+1 edges) 8 | //x2 = x2 - (log(float(x2.size())) / (2 * float(x2.size()))); 9 | x2 = x2 - (1 / float(x2.size())); 10 | NumericVector liks = x2 - x1; 11 | double sd = Rcpp::sd(liks); 12 | double stat = sum(liks) / (sd * sqrt(float(x2.size()))); 13 | double pval = 1 - normal_cdf(stat); 14 | return pval; 15 | } -------------------------------------------------------------------------------- /man/is.cascade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{is.cascade} 4 | \alias{is.cascade} 5 | \title{Is the object of class cascade?} 6 | \usage{ 7 | is.cascade(object) 8 | } 9 | \arguments{ 10 | \item{object}{the object to be tested.} 11 | } 12 | \value{ 13 | \code{TRUE} if object is a cascade, \code{FALSE} otherwise. 14 | } 15 | \description{ 16 | Is the object of class cascade? 17 | } 18 | \examples{ 19 | 20 | data(cascades) 21 | is.cascade(cascades) 22 | # > TRUE 23 | is.cascade(1) 24 | # > FALSE 25 | } 26 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | netinf_ <- function(cascade_nodes, cascade_times, n_edges, model, params, quiet, auto_edges, cutoff) { 5 | .Call(`_NetworkInference_netinf_`, cascade_nodes, cascade_times, n_edges, model, params, quiet, auto_edges, cutoff) 6 | } 7 | 8 | count_possible_edges_ <- function(cascade_nodes, cascade_times, quiet = TRUE) { 9 | .Call(`_NetworkInference_count_possible_edges_`, cascade_nodes, cascade_times, quiet) 10 | } 11 | 12 | -------------------------------------------------------------------------------- /release_checklist.md: -------------------------------------------------------------------------------- 1 | * Increment version number in DESCRIPTION 2 | * Update version date in DESCRIPTION 3 | * Increment version number in R/zzz.R 4 | * Update NEWS.md 5 | * Check on winbuilder (`devtools::build_win()`) 6 | * Check locally (with valgrind) (`devtools:check(args = "--use-valgrind")`) 7 | * Push to github and wait for Travis check 8 | * Check reverse dependencies (`devtools::revdep_check()`) 9 | * Update cran-comments.md 10 | * Release on cran 11 | * Wait until released 12 | * Create Github tag 13 | * Set version number to devel in DESCRIPTION 14 | * Set version number to devel in R/zzz.R 15 | * Update News with devel version 16 | * Push to github -------------------------------------------------------------------------------- /tests/testthat/test_netinf.R: -------------------------------------------------------------------------------- 1 | library(NetworkInference) 2 | 3 | context("Test if core netinf method works") 4 | 5 | data(cascades) 6 | data(validation) 7 | test_that("netinf produces the edges as original netinf executable.", { 8 | from_netinf <- netinf(cascades, params = 1, trans_mod = "exponential", 9 | n_edges = 5, quiet = TRUE) 10 | t1 <- from_netinf[order(from_netinf[, 1], from_netinf[, 2]), c(-3, -4)] 11 | rownames(t1) <- c(1:nrow(t1)) 12 | t2 <- validation[order(validation[, 1], validation[, 2]), -c(3:6)] 13 | rownames(t2) <- c(1:nrow(t2)) 14 | class(t2) <- c("diffnet", "data.frame") 15 | expect_equal(t1, t2) 16 | }) -------------------------------------------------------------------------------- /man/is.diffnet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/netinf.R 3 | \name{is.diffnet} 4 | \alias{is.diffnet} 5 | \title{Is the object of class diffnet?} 6 | \usage{ 7 | is.diffnet(object) 8 | } 9 | \arguments{ 10 | \item{object}{the object to be tested.} 11 | } 12 | \value{ 13 | \code{TRUE} if object is a diffnet, \code{FALSE} otherwise. 14 | } 15 | \description{ 16 | Tests if an object is of class diffnet. The class diffnet is appended to the 17 | object returned by \code{\link{netinf}} for dispatch of appropriate plotting 18 | methods. 19 | } 20 | \examples{ 21 | 22 | data(cascades) 23 | result <- netinf(cascades, n_edges = 6, params = 1) 24 | is.diffnet(result) 25 | } 26 | -------------------------------------------------------------------------------- /man/count_possible_edges.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/netinf.R 3 | \name{count_possible_edges} 4 | \alias{count_possible_edges} 5 | \title{Count the number of possible edges in the dataset} 6 | \usage{ 7 | count_possible_edges(cascades) 8 | } 9 | \arguments{ 10 | \item{cascades}{Object of class cascade containing the data.} 11 | } 12 | \value{ 13 | An integer count. 14 | } 15 | \description{ 16 | Across all cascades, count the edges that are possible. An edge from node 17 | \code{u} to node \code{v} 18 | is only possible if in at least one cascade \code{u} experienced an event 19 | before \code{v}. 20 | } 21 | \examples{ 22 | data(cascades) 23 | count_possible_edges(cascades) 24 | 25 | } 26 | -------------------------------------------------------------------------------- /man/sim_validation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{sim_validation} 5 | \alias{sim_validation} 6 | \title{Larger simulated validation network.} 7 | \format{An object of class \code{data.frame} with 4 columns, containing: 8 | \describe{ 9 | \item{origin_node}{Origin of diffusion edge.} 10 | \item{destination_node}{Destination node of diffusion edge.} 11 | \item{improvement}{Improvement in score for the edge} 12 | \item{p-value}{p-value for vuong test} 13 | }} 14 | \source{ 15 | See code below. 16 | } 17 | \usage{ 18 | data(sim_validation) 19 | } 20 | \description{ 21 | A network from simulated data. For testing purposes. 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(as.data.frame,cascade) 4 | S3method(as.matrix,cascade) 5 | S3method(plot,cascade) 6 | S3method(plot,diffnet) 7 | S3method(summary,cascade) 8 | export(as_cascade_long) 9 | export(as_cascade_wide) 10 | export(count_possible_edges) 11 | export(drop_nodes) 12 | export(is.cascade) 13 | export(is.diffnet) 14 | export(netinf) 15 | export(simulate_cascades) 16 | export(simulate_rnd_cascades) 17 | export(subset_cascade) 18 | export(subset_cascade_time) 19 | import(assertthat) 20 | import(checkmate) 21 | import(ggplot2) 22 | import(ggrepel) 23 | importFrom(Rcpp,sourceCpp) 24 | importFrom(stats,density) 25 | importFrom(stats,runif) 26 | useDynLib(NetworkInference, .registration = TRUE) 27 | -------------------------------------------------------------------------------- /vignettes/bibliography.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{gomez2010inferring, 2 | title={Inferring networks of diffusion and influence}, 3 | author={Gomez Rodriguez, Manuel and Leskovec, Jure and Krause, Andreas}, 4 | booktitle={Proceedings of the 16th ACM SIGKDD international conference on Knowledge discovery and data mining}, 5 | pages={1019--1028}, 6 | year={2010}, 7 | organization={ACM} 8 | } 9 | 10 | @article{desmarais2015persistent, 11 | title={Persistent Policy Pathways: Inferring Diffusion Networks in the American States}, 12 | author={Desmarais, Bruce A and Harden, Jeffrey J and Boehmke, Frederick J}, 13 | journal={American Political Science Review}, 14 | volume={109}, 15 | number={02}, 16 | pages={392--406}, 17 | year={2015}, 18 | publisher={Cambridge Univ Press} 19 | } 20 | -------------------------------------------------------------------------------- /man/simulate_rnd_cascades.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simulate_cascades.R 3 | \name{simulate_rnd_cascades} 4 | \alias{simulate_rnd_cascades} 5 | \title{Simulate a set of random cascades} 6 | \usage{ 7 | simulate_rnd_cascades(n_cascades, n_nodes) 8 | } 9 | \arguments{ 10 | \item{n_cascades}{Number of cascades to generate.} 11 | 12 | \item{n_nodes}{Number of nodes in the system.} 13 | } 14 | \value{ 15 | A data frame containing (in order of columns) node ids, 16 | event time and cascade identifier. 17 | } 18 | \description{ 19 | Simulate random cascades, for testing and demonstration purposes. No actual 20 | diffusion model is underlying these cascades. 21 | } 22 | \examples{ 23 | 24 | df <- simulate_rnd_cascades(10, n_nodes = 20) 25 | head(df) 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/NetworkInference_init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* .Call calls */ 7 | extern SEXP _NetworkInference_count_possible_edges_(SEXP, SEXP, SEXP); 8 | extern SEXP _NetworkInference_netinf_(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, 9 | SEXP); 10 | 11 | static const R_CallMethodDef CallEntries[] = { 12 | {"_NetworkInference_count_possible_edges_", (DL_FUNC) &_NetworkInference_count_possible_edges_, 3}, 13 | {"_NetworkInference_netinf_", (DL_FUNC) &_NetworkInference_netinf_, 8}, 14 | {NULL, NULL, 0} 15 | }; 16 | 17 | void R_init_NetworkInference(DllInfo *dll) 18 | { 19 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 20 | R_useDynamicSymbols(dll, FALSE); 21 | } 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | # Example code in package build process 8 | *-Ex.R 9 | # Output files from R CMD build 10 | /*.tar.gz 11 | # Output files from R CMD check 12 | /*.Rcheck/ 13 | # RStudio files 14 | .Rproj.user/ 15 | *.Rproj 16 | # produced vignettes 17 | vignettes/*.html 18 | vignettes/*.pdf 19 | vignettes/*_cache/ 20 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 21 | .httr-oauth 22 | # knitr and R markdown default cache directories 23 | /*_cache/ 24 | /cache/ 25 | # Temporary files created by R markdown 26 | *.utf8.md 27 | *.knit.md 28 | .Rproj.user 29 | # Compiled files 30 | *.o 31 | *.so 32 | *.a 33 | # Development note 34 | devel_notes.md 35 | # Generated test data 36 | data/test_out.txt 37 | inst/doc 38 | README_old.md 39 | readme.html 40 | /revdep 41 | /revdep/.cache.rds 42 | *.swp 43 | -------------------------------------------------------------------------------- /man/subset_cascade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{subset_cascade} 4 | \alias{subset_cascade} 5 | \title{Select a subset of cascades from cascade object} 6 | \usage{ 7 | subset_cascade(cascade, selection) 8 | } 9 | \arguments{ 10 | \item{cascade}{cascade, object to select from} 11 | 12 | \item{selection}{character or integer, vector of cascade_ids to select} 13 | } 14 | \value{ 15 | An object of class cascade containing just the selected cascades 16 | } 17 | \description{ 18 | Select a subset of cascades from cascade object 19 | } 20 | \examples{ 21 | 22 | data(policies) 23 | cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 24 | event_time = 'adopt_year', cascade_id = 'policy') 25 | cascade_names <- names(cascades$cascade_times) 26 | subset_cascade(cascades, selection = cascade_names[1:10]) 27 | 28 | } 29 | -------------------------------------------------------------------------------- /man/drop_nodes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{drop_nodes} 4 | \alias{drop_nodes} 5 | \title{Drop nodes from a cascade object} 6 | \usage{ 7 | drop_nodes(cascades, nodes, drop = TRUE) 8 | } 9 | \arguments{ 10 | \item{cascades}{cascade, object to drop nodes from.} 11 | 12 | \item{nodes}{character or integer, vector of node_ids to drop.} 13 | 14 | \item{drop}{logical, Should empty cascades be dropped.} 15 | } 16 | \value{ 17 | An object of class cascade containing the cascades without the 18 | dropped nodes. 19 | } 20 | \description{ 21 | Drop nodes from a cascade object 22 | } 23 | \examples{ 24 | 25 | data(policies) 26 | cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 27 | event_time = 'adopt_year', cascade_id = 'policy') 28 | new_cascades <- drop_nodes(cascades, c("California", "New York")) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /man/cascades.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{cascades} 5 | \alias{cascades} 6 | \title{Example cascades} 7 | \format{An object of class \code{cascade} containing 4 objects 8 | \describe{ 9 | \item{node_names}{Character node names} 10 | \item{cascade_nodes}{A list of integer vectors. Each containing the names of the 11 | nodes infected in this cascades in the order of infection} 12 | \item{cascade_times}{A list of numeric vectors. Each containing the infection 13 | times for the corresponding nodes in cascade_nodes} 14 | }} 15 | \source{ 16 | \url{https://github.com/snap-stanford/snap/blob/master/examples/netinf/example-cascades.txt} 17 | } 18 | \usage{ 19 | data(cascades) 20 | } 21 | \description{ 22 | An example dataset of 31 nodes and 54 cascades. From the original netinf 23 | implementation in SNAP. 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /src/distributions.h: -------------------------------------------------------------------------------- 1 | using namespace Rcpp; 2 | 3 | /** 4 | * Exponential density 5 | * 6 | * @param x Value to evaluate. 7 | * @param lambda Rate paramter of the distribution. 8 | * 9 | * @return Density value 10 | */ 11 | double dexp_(double x, double lambda); 12 | 13 | /** 14 | * Rayleigh density 15 | * 16 | * @param x Value to evaluate. 17 | * @param lambda shape paramter of the distribution. 18 | * 19 | * @return Density value 20 | */ 21 | double drayleigh_(double x, double lambda); 22 | 23 | /** 24 | * Log-normal density 25 | * 26 | * @param x Value to evaluate. 27 | * @param mu mean 28 | * @param sigma variance 29 | * 30 | * @return Density value 31 | */ 32 | double dlognormal_(double x, double mu, double sigma); 33 | 34 | /** 35 | * Cumulative distribution function of the standard normal distribution 36 | * 37 | * @param x Value to evaluate. 38 | * 39 | * @return Probability of X > x. 40 | */ 41 | double normal_cdf(double x); 42 | -------------------------------------------------------------------------------- /man/subset_cascade_time.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{subset_cascade_time} 4 | \alias{subset_cascade_time} 5 | \title{Subset a cascade object in time} 6 | \usage{ 7 | subset_cascade_time(cascade, start_time, end_time, drop = TRUE) 8 | } 9 | \arguments{ 10 | \item{cascade}{cascade, object to subset.} 11 | 12 | \item{start_time}{numeric, start time of the subset.} 13 | 14 | \item{end_time}{numeric, end time of the subset.} 15 | 16 | \item{drop}{logical, should empty sub-cascades be dropped?} 17 | } 18 | \value{ 19 | An object of class cascade, where only events are included that have 20 | times \code{start_time} <= t < \code{end_time}. 21 | } 22 | \description{ 23 | Remove each all events occurring outside the desired subset for each cascade 24 | in a cascade object. 25 | } 26 | \examples{ 27 | 28 | data(cascades) 29 | sub_cascades <- subset_cascade_time(cascades, 10, 20, drop=TRUE) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /man/as.matrix.cascade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{as.matrix.cascade} 4 | \alias{as.matrix.cascade} 5 | \title{Convert a cascade object to a matrix} 6 | \usage{ 7 | \method{as.matrix}{cascade}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{cascade object to convert.} 11 | 12 | \item{...}{additional arguments to be passed to or from methods. 13 | (Currently not supported.)} 14 | } 15 | \value{ 16 | A matrix containing all cascade information in wide format. That is, 17 | each row of the matrix corresponds to a node and each column to a cascade. 18 | Cell entries are event times. Censored nodes have \code{NA} for their entry. 19 | } 20 | \description{ 21 | Generates a \code{\link{matrix}} containing the cascade information in the 22 | cascade object in wide format. Missing values are used for nodes that do not 23 | experience an event in a cascade. 24 | } 25 | \examples{ 26 | 27 | data(cascades) 28 | as.matrix(cascades) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /man/validation.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{validation} 5 | \alias{validation} 6 | \title{Validation output from netinf source.} 7 | \format{An object of class \code{data.frame} with 6 columns, containing: 8 | \describe{ 9 | \item{origin_node}{Origin of diffusion edge.} 10 | \item{destination_node}{Destination node of diffusion edge.} 11 | \item{volume}{??} 12 | \item{marginal_gain}{Marginal gain from edge.} 13 | \item{median_time_difference}{Median time between events in origin and 14 | destination} 15 | \item{mean_time_difference}{Mean time between events in origin and 16 | destination} 17 | }} 18 | \source{ 19 | Output from netinf example program (\url{https://github.com/snap-stanford/snap/tree/master/examples/netinf}). 20 | } 21 | \usage{ 22 | data(validation) 23 | } 24 | \description{ 25 | Contains output from original netinf C++ implementation, executed on 26 | \code{\link{cascades}}. For testing purposes. 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: NetworkInference 2 | Type: Package 3 | Title: Inferring Latent Diffusion Networks 4 | Version: 1.2.5 5 | Date: 2025-11-28 6 | Authors@R: c(person("Fridolin", "Linder", email = "fridolin.linder@gmail.com", 7 | role = c("aut", "cre")), 8 | person("Bruce", "Desmarais", role = "ctb")) 9 | Description: This is an R implementation of the netinf algorithm (Gomez Rodriguez, Leskovec, and Krause, 2010). Given a set of events that spread between a set of nodes the algorithm infers the most likely stable diffusion network that is underlying the diffusion process. 10 | License: MIT + file LICENSE 11 | Imports: 12 | Rcpp (>= 0.12.5), 13 | assertthat, 14 | checkmate, 15 | ggplot2, 16 | ggrepel, 17 | stats 18 | LinkingTo: 19 | Rcpp, 20 | RcppProgress 21 | BugReports: https://github.com/desmarais-lab/NetworkInference/issues 22 | Suggests: 23 | testthat, 24 | knitr, 25 | rmarkdown, 26 | pander, 27 | igraph, 28 | utils, 29 | dplyr 30 | RoxygenNote: 6.1.1 31 | LazyData: true 32 | VignetteBuilder: knitr 33 | -------------------------------------------------------------------------------- /man/as.data.frame.cascade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{as.data.frame.cascade} 4 | \alias{as.data.frame.cascade} 5 | \title{Convert a cascade object to a data frame} 6 | \usage{ 7 | \method{as.data.frame}{cascade}(x, row.names = NULL, optional = FALSE, 8 | ...) 9 | } 10 | \arguments{ 11 | \item{x}{Cascade object to convert.} 12 | 13 | \item{row.names}{NULL or a character vector giving the row names for the data 14 | frame. Missing values are not allowed.} 15 | 16 | \item{optional}{logical. If TRUE, setting row names and converting column 17 | names (to syntactic names: see make.names) is optional. (Not supported)} 18 | 19 | \item{...}{Additional arguments passed to \code{\link{data.frame}}.} 20 | } 21 | \value{ 22 | A data frame with three columns. Containing 1) The names of 23 | the nodes (\code{"node_name"}) that experience an event in each cascade, 24 | 2) the event time (\code{"event_time"}) of the corresponding node, 25 | 3) the cascade identifier \code{"cascade_id"}. 26 | } 27 | \description{ 28 | Generates a data frame containing the cascade information in the cascade object. 29 | } 30 | \examples{ 31 | 32 | data(cascades) 33 | as.data.frame(cascades) 34 | 35 | } 36 | -------------------------------------------------------------------------------- /man/plot.diffnet.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{plot.diffnet} 4 | \alias{plot.diffnet} 5 | \title{Visualize netinf output} 6 | \usage{ 7 | \method{plot}{diffnet}(x, type = "network", ...) 8 | } 9 | \arguments{ 10 | \item{x}{object of class diffnet to be plotted.} 11 | 12 | \item{type}{character, one of \code{c("network", "improvement", "p-value")} 13 | indicating if the inferred diffusion network, the 14 | improvement for each edge or the p-value from the vuong test for each 15 | edge should be visualized .} 16 | 17 | \item{...}{additional arguments.} 18 | } 19 | \value{ 20 | A ggplot plot object if \code{type = "improvement"} otherwise an 21 | igraph plot. 22 | } 23 | \description{ 24 | Visualize the inferred diffusion network or the marginal gain in fit obtained 25 | by addition of each edge. 26 | } 27 | \details{ 28 | If `type = improvement` a ggplot object is returned. It can be modified like 29 | any other ggplot. See the ggplot documentation and the examples in 30 | \link{plot.cascade}. 31 | } 32 | \examples{ 33 | 34 | \dontrun{ 35 | data(cascades) 36 | res <- netinf(cascades, quiet = TRUE) 37 | plot(res, type = "network") 38 | plot(res, type = "improvement") 39 | plot(res, type = "p-value") 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/distributions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | using namespace Rcpp; 4 | 5 | double dexp_(double x, double lambda) { 6 | return lambda * std::exp(-1 * lambda * x); 7 | } 8 | 9 | double drayleigh_(double x, double lambda) { 10 | return (x / pow(lambda, 2)) * std::exp(-pow(x, 2) / (2 * pow(lambda, 2))); 11 | } 12 | 13 | double dlognormal_(double x, double mu, double sigma) { 14 | if(x <= 0) { 15 | std::string msg = "x outside support of log-normal distribution.\n"; 16 | throw std::invalid_argument(msg); 17 | } 18 | return 1 / (x*sigma*sqrt(2*M_PI)) * 19 | std::exp(-(pow((log(x) - mu), 2)/(2*pow(sigma, 2)))); 20 | } 21 | 22 | // https://www.johndcook.com/blog/cpp_phi/ 23 | double normal_cdf(double x) { 24 | // constants 25 | double a1 = 0.254829592; 26 | double a2 = -0.284496736; 27 | double a3 = 1.421413741; 28 | double a4 = -1.453152027; 29 | double a5 = 1.061405429; 30 | double p = 0.3275911; 31 | 32 | // Save the sign of x 33 | int sign = 1; 34 | if (x < 0) 35 | sign = -1; 36 | x = fabs(x)/sqrt(2.0); 37 | 38 | // A&S formula 7.1.26 39 | double t = 1.0/(1.0 + p*x); 40 | double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x); 41 | 42 | return 0.5*(1.0 + sign*y); 43 | } 44 | -------------------------------------------------------------------------------- /netinf_benchmark.R: -------------------------------------------------------------------------------- 1 | library(microbenchmark) 2 | devtools::load_all() 3 | 4 | cs = c(10, 50, 100, 500) 5 | ns = c(5, 10, 50, 100) 6 | params = data.frame(expand.grid(cs, ns), t = NA) 7 | set.seed(123) 8 | for(i in 1:nrow(params)) { 9 | df <- simulate_rnd_cascades(params[i, 1], n_nodes = params[i, 2]) 10 | cascades <- as_cascade_long(df, node_names = unique(df$node_name)) 11 | bm = microbenchmark('test' = netinf(cascades, trans_mod = "exponential", 12 | n_edges = 10, params = 1, 13 | quiet = T), times = 50) 14 | netinf(cascades, max_iter = 1, params = 10) 15 | t = mean(bm$time) * 1e-6 16 | params[i, 3] = t 17 | } 18 | 19 | # Current performance (4a506800389c2047c09eead95bc0b3224a2c0654) 20 | Var1 Var2 t 21 | 1 10 5 1.1869810 22 | 2 50 5 0.7875174 23 | 3 100 5 1.4549489 24 | 4 500 5 5.9649529 25 | 5 10 10 0.7451685 26 | 6 50 10 1.5072754 27 | 7 100 10 3.5749858 28 | 8 500 10 12.8299009 29 | 9 10 50 7.3565341 30 | 10 50 50 22.9769044 31 | 11 100 50 64.7670850 32 | 12 500 50 271.0306328 33 | 13 10 100 19.7277996 34 | 14 50 100 113.3796667 35 | 15 100 100 272.5178511 36 | 16 500 100 1712.7072782 37 | -------------------------------------------------------------------------------- /src/possible_edges.h: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | typedef std::array id_array; 5 | typedef std::pair, double> edge_value; 6 | typedef std::map edge_map; 7 | 8 | /** 9 | * Finds all possible edges from a set of cascades 10 | * 11 | * @param cascade_nodes A list of integer vectors containing the nodes involved 12 | * in each cascade in order of infection 13 | * @param cascade_times A list of numeric vectors containing the infection times 14 | * of each node in the corresponding vector in cascade_nodes 15 | * @param quiet Should progress be reported 16 | * 17 | * @return edge_map with all possible edges 18 | */ 19 | edge_map get_possible_edges_(List &cascade_nodes, List &cascade_times, 20 | bool &quiet); 21 | 22 | /** 23 | * Wrapper to count possible edges (for rcpp export) 24 | * @param cascade_nodes A list of integer vectors containing the nodes involved 25 | * in each cascade in order of infection 26 | * @param cascade_times A list of numeric vectors containing the infection times 27 | * of each node in the corresponding vector in cascade_nodes 28 | * @param quiet Should progress be reported 29 | * 30 | * @return integer number of possible edges 31 | */ 32 | int count_possible_edges_(List &cascade_nodes, List &cascade_times, 33 | bool &quiet); -------------------------------------------------------------------------------- /src/netinf_utilities.h: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace Rcpp; 3 | 4 | /** 5 | * Type definitions 6 | */ 7 | typedef std::chrono::system_clock Clock; 8 | typedef std::chrono::system_clock::time_point time_point; 9 | typedef std::chrono::duration time_duration; 10 | 11 | 12 | /** 13 | * Find the position of an (first) integer in a vector 14 | * 15 | * @param x Vector to search in. 16 | * @param val Value to search for. 17 | * 18 | * @return Integer index of location of val in x or -1 if val not in x. 19 | */ 20 | int get_index(IntegerVector x, int val); 21 | 22 | /** 23 | * Sum all elements of a vector, skipping na values 24 | * 25 | * @param x Vector to sum over 26 | */ 27 | double sum_vector(NumericVector x); 28 | 29 | /** 30 | * Copy a numeric vector 31 | */ 32 | NumericVector copy_vector(NumericVector x); 33 | 34 | /** 35 | * Print the estimated estimation time in legible units 36 | * 37 | * @param fp_ms duration of inference of a single edge in milliseconds. 38 | * @param auto_edges Does the algorithm run in auto mode (no fixed number of 39 | * edges). 40 | * @param n_edges Number of edges to infer (only relevant if auto_edges=false). 41 | */ 42 | void print_time_estimate(std::chrono::duration fp_ms, 43 | bool auto_edges, int n_edges); 44 | 45 | 46 | time_point print_timing(time_point start_time, std::string step); -------------------------------------------------------------------------------- /inst/CITATION: -------------------------------------------------------------------------------- 1 | citHeader("To cite NetworkInference in publications use:") 2 | 3 | bibentry( 4 | bibtype = "Misc", 5 | key = "NetworkInference", 6 | title = "{NetworkInference}: Inferring latent diffusion networks", 7 | author = c( 8 | person("Fridolin", "Linder"), 9 | person("Bruce A.", "Desmarais") 10 | ), 11 | year = "2017", 12 | url = "https://github.com/desmarais-lab/NetworkInference" 13 | ) 14 | 15 | bibentry( 16 | bibtype = "Misc", 17 | key = "BoehmkeNSF", 18 | title = "Collaborative Research: An Expanded Framework for Inferring Public Policy Diffusion Networks.", 19 | author = c( 20 | person("Frederick J.", "Boehmke"), 21 | person("Bruce A.", "Desmarais"), 22 | person("Jeffrey J.", "Harden"), 23 | person("Hanna M.", "Wallach") 24 | ), 25 | journal = "National Science Foundation", 26 | volume = "SES Awards: 1558661, 1558781, 1558561, and 1558509" 27 | ) 28 | 29 | bibentry( 30 | bibtype = "Article", 31 | key = "BoehmkeSPID", 32 | title = "A New Database for Inferring Public Policy Innovativeness and Diffusion Networks.", 33 | author = c( 34 | person("Frederick J.", "Boehmke"), 35 | person("Mark D.", "Brockway"), 36 | person("Bruce A.", "Desmarais"), 37 | person("Jeffrey J.", "Harden"), 38 | person("Scott J.", "LaCombe"), 39 | person("Fridolin", "Linder"), 40 | person("Hanna M.", "Wallach") 41 | ), 42 | journal = "Presented at the 2017 Midwest Political Science Association Annual Conference", 43 | year = "2017" 44 | ) 45 | 46 | -------------------------------------------------------------------------------- /man/summary.cascade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/summary.R 3 | \name{summary.cascade} 4 | \alias{summary.cascade} 5 | \title{Summarize a cascade object} 6 | \usage{ 7 | \method{summary}{cascade}(object, quiet = FALSE, ...) 8 | } 9 | \arguments{ 10 | \item{object}{object of class cascade to be summarized.} 11 | 12 | \item{quiet}{logical, if \code{FALSE} summary stats are printed to std out.} 13 | 14 | \item{...}{Additional arguments passed to summary.} 15 | } 16 | \value{ 17 | Prints cascade summary information to the screen 18 | (if \code{quiet = FALSE}). \code{'# cascades'} is the number of cascades in 19 | the object, \code{'# nodes'} is the number of nodes in the system (nodes 20 | that can theoretically experience an event), \code{'# nodes in cascades'} is 21 | the number of unique nodes of the system that experienced an event and 22 | \code{'# possible edges'} is the number of edges that are possible given 23 | the cascade data (see \code{\link{count_possible_edges}} for details.). 24 | 25 | Additional summaries for each cascade are returned invisibly. 26 | cascade), \code{length} (length of the cascade as an integer of how many 27 | nodes experienced and event) and \code{n_ties} (number of tied event 28 | times per cascade). 29 | } 30 | \description{ 31 | Generates summary statistics for single cascades and across cascades in a 32 | collection, contained in a cascades object. 33 | } 34 | \examples{ 35 | data(cascades) 36 | summary(cascades) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /man/plot.cascade.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot.R 3 | \name{plot.cascade} 4 | \alias{plot.cascade} 5 | \title{Plot a cascade object} 6 | \usage{ 7 | \method{plot}{cascade}(x, label_nodes = TRUE, selection = NULL, ...) 8 | } 9 | \arguments{ 10 | \item{x}{object of class cascade to be plotted.} 11 | 12 | \item{label_nodes}{logical, indicating if should the nodes in each cascade be 13 | labeled. If the cascades are very dense setting this to \code{FALSE} is 14 | recommended.} 15 | 16 | \item{selection}{a vector of cascade ids to plot.} 17 | 18 | \item{...}{additional arguments passed to plot.} 19 | } 20 | \value{ 21 | A ggplot plot object. 22 | } 23 | \description{ 24 | Allows plotting of one or multiple, labeled or unlabeled cascades. 25 | } 26 | \details{ 27 | The function returns a ggplot plot object (class \code{gg, ggplot}) which 28 | can be modified like any other ggplot. See the ggplot documentation and the 29 | examples below for more details. 30 | } 31 | \examples{ 32 | 33 | data(cascades) 34 | plot(cascades, selection = names(cascades$cascade_nodes)[1:5]) 35 | plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20)) 36 | 37 | # Modify resulting ggplot object 38 | library(ggplot2) 39 | p <- plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20)) 40 | ## Add a title 41 | p <- p + ggtitle('Your Title') 42 | p 43 | ## Change Axis 44 | p <- p + xlab("Your modified y axis label") #x and y labels are flipped here 45 | p <- p + ylab("Your modified x axis label") #x and y labels are flipped here 46 | p 47 | 48 | } 49 | -------------------------------------------------------------------------------- /tests/testthat/test_simulation.R: -------------------------------------------------------------------------------- 1 | #library(NetworkInference) 2 | # 3 | #context("Test if simulation methods works") 4 | # 5 | #if (requireNamespace("igraph", quietly = TRUE)) { 6 | # data(cascades) 7 | # test_that("Simulation function works.", { 8 | # from_netinf <- netinf(cascades, params = 1, trans_mod = "exponential", 9 | # n_edges = 5, quiet = TRUE) 10 | # set.seed(123) 11 | # out <- simulate_cascades(from_netinf, nsim = 100, max_time = 10) 12 | # casc <- as_cascade_long(out) 13 | # rec <- netinf(casc, params = 1, trans_mod = "exponential", n_edges = 5, 14 | # quiet = TRUE) 15 | # rec <- rec[order(as.numeric(rec[, 1])), ] 16 | # rec <- rec[order(as.numeric(rec[, 2])), ] 17 | # from_netinf <- from_netinf[order(as.numeric(from_netinf[, 1])), ] 18 | # from_netinf <- from_netinf[order(as.numeric(from_netinf[, 2])), ] 19 | # rownames(rec) <- rownames(from_netinf) <- NULL 20 | # expect_equal(from_netinf[, c(-3, -4)], rec[, c(-3, -4)]) 21 | # }) 22 | # test_that("Simulation function with partial cascade works.", { 23 | # partial_cascade <- cascades 24 | # partial_cascade$cascade_nodes <- cascades$cascade_nodes[10] 25 | # partial_cascade$cascade_times <- cascades$cascade_times[10] 26 | # from_netinf <- netinf(cascades, params = 1, trans_mod = "exponential", 27 | # n_edges = 100, quiet = TRUE) 28 | # out <- simulate_cascades(from_netinf, nsim = 100, max_time = 10, 29 | # partial_cascade = partial_cascade) 30 | # expect_equal(length(unique(out[out$node_name == "22", 2])), 1) 31 | # expect_equal(length(unique(out[out$node_name == "28", 2])), 1) 32 | # }) 33 | #} 34 | -------------------------------------------------------------------------------- /R/NetworkInference.R: -------------------------------------------------------------------------------- 1 | #' NetworkInference: Inferring latent diffusion networks 2 | #' 3 | #' This package provides an R implementation of the \code{netinf} algorithm 4 | #' created by Gomez Rodriguez, Leskovec, and Krause (2010). Given a set of 5 | #' events that spread between a set of nodes the algorithm infers the most likely 6 | #' stable diffusion network that is underlying the diffusion process. 7 | #' 8 | #' The package provides three groups of functions: 1) data preparation 9 | #' 2) estimation and 3) interpretation. 10 | #' 11 | #' @section Data preparation: 12 | #' 13 | #' The core estimation function \code{\link{netinf}} requires an object of class 14 | #' \code{cascade} (see \link{as_cascade_long} and \link{as_cascade_wide}). 15 | #' Cascade data contains information on the potential nodes in the network as 16 | #' well as on event times for each node in each cascade. 17 | #' 18 | #' @section Estimation: 19 | #' 20 | #' Diffusion networks are estimated using the \code{\link{netinf}} function. It 21 | #' produces a diffusion network in form of an edgelist (of class 22 | #' \code{\link{data.frame}}). 23 | #' 24 | #' @section Interpretation and Visualization: 25 | #' 26 | #' Cascade data can be visualized with the \code{plot} method of the \code{cascade} 27 | #' class (\code{diffnet, \link{plot.cascade}}). Results of the estimation process can 28 | #' be visualized using the plotting method of the \code{diffnet} class. 29 | #' 30 | #' @section Performance: 31 | #' 32 | #' If higher performance is required and for very large data sets, a faster pure C++ 33 | #' implementation is available in the Stanford Network Analysis Project (SNAP). 34 | #' The software can be downloaded at \url{http://snap.stanford.edu/netinf/}. 35 | #' 36 | #' @useDynLib NetworkInference, .registration = TRUE 37 | #' @importFrom Rcpp sourceCpp 38 | #' @name NetworkInference 39 | NULL -------------------------------------------------------------------------------- /man/NetworkInference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/NetworkInference.R 3 | \name{NetworkInference} 4 | \alias{NetworkInference} 5 | \title{NetworkInference: Inferring latent diffusion networks} 6 | \description{ 7 | This package provides an R implementation of the \code{netinf} algorithm 8 | created by Gomez Rodriguez, Leskovec, and Krause (2010). Given a set of 9 | events that spread between a set of nodes the algorithm infers the most likely 10 | stable diffusion network that is underlying the diffusion process. 11 | } 12 | \details{ 13 | The package provides three groups of functions: 1) data preparation 14 | 2) estimation and 3) interpretation. 15 | } 16 | \section{Data preparation}{ 17 | 18 | 19 | The core estimation function \code{\link{netinf}} requires an object of class 20 | \code{cascade} (see \link{as_cascade_long} and \link{as_cascade_wide}). 21 | Cascade data contains information on the potential nodes in the network as 22 | well as on event times for each node in each cascade. 23 | } 24 | 25 | \section{Estimation}{ 26 | 27 | 28 | Diffusion networks are estimated using the \code{\link{netinf}} function. It 29 | produces a diffusion network in form of an edgelist (of class 30 | \code{\link{data.frame}}). 31 | } 32 | 33 | \section{Interpretation and Visualization}{ 34 | 35 | 36 | Cascade data can be visualized with the \code{plot} method of the \code{cascade} 37 | class (\code{diffnet, \link{plot.cascade}}). Results of the estimation process can 38 | be visualized using the plotting method of the \code{diffnet} class. 39 | } 40 | 41 | \section{Performance}{ 42 | 43 | 44 | If higher performance is required and for very large data sets, a faster pure C++ 45 | implementation is available in the Stanford Network Analysis Project (SNAP). 46 | The software can be downloaded at \url{http://snap.stanford.edu/netinf/}. 47 | } 48 | 49 | -------------------------------------------------------------------------------- /vignettes/quickstart_vignette.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NetworkInference: Quick Start Guide" 3 | author: "Fridolin Linder" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{NetworkInference: Quick Start Guide} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | --- 13 | 14 | # Introduction 15 | 16 | --- 17 | 18 | This package provides an R implementation of the netinf algorithm 19 | created by @gomez2010inferring (see [here](http://snap.stanford.edu/netinf/) 20 | for more information and the original C++ implementation). Given a set of events 21 | that spread between a set of nodes the algorithm infers the most likely stable 22 | diffusion network that is underlying the diffusion process. 23 | 24 | --- 25 | 26 | # Installation 27 | 28 | --- 29 | 30 | The package can be installed from [CRAN](https://CRAN.R-project.org/): 31 | ```{r, eval=FALSE} 32 | install.packages("NetworkInference") 33 | ``` 34 | 35 | The latest development version can be installed from 36 | [github](https://github.com/desmarais-lab/NetworkInference): 37 | ```{r, eval=FALSE} 38 | #install.packages(devtools) 39 | devtools::install_github('desmarais-lab/NetworkInference') 40 | ``` 41 | 42 | --- 43 | 44 | # Quick start guide 45 | 46 | --- 47 | 48 | To get started, get your data into the `cascades` format required by the `netinf` 49 | function: 50 | 51 | ```{r, results='hide', message=FALSE} 52 | library(NetworkInference) 53 | 54 | # Simulate random cascade data 55 | df <- simulate_rnd_cascades(50, n_node = 20) 56 | 57 | # Cast data into `cascades` object 58 | ## From long format 59 | cascades <- as_cascade_long(df) 60 | 61 | ## From wide format 62 | df_matrix <- as.matrix(cascades) ### Create example matrix 63 | cascades <- as_cascade_wide(df_matrix) 64 | ``` 65 | 66 | Then fit the model: 67 | ```{r} 68 | result <- netinf(cascades, quiet = TRUE, p_value_cutoff = 0.05) 69 | ``` 70 | 71 | ```{r, eval=FALSE} 72 | head(result) 73 | ``` 74 | ```{r, results="asis", echo=FALSE} 75 | pander::pandoc.table(head(result)) 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | // netinf_ 9 | List netinf_(List& cascade_nodes, List& cascade_times, int& n_edges, std::string& model, NumericVector& params, bool quiet, bool& auto_edges, double& cutoff); 10 | RcppExport SEXP _NetworkInference_netinf_(SEXP cascade_nodesSEXP, SEXP cascade_timesSEXP, SEXP n_edgesSEXP, SEXP modelSEXP, SEXP paramsSEXP, SEXP quietSEXP, SEXP auto_edgesSEXP, SEXP cutoffSEXP) { 11 | BEGIN_RCPP 12 | Rcpp::RObject rcpp_result_gen; 13 | Rcpp::RNGScope rcpp_rngScope_gen; 14 | Rcpp::traits::input_parameter< List& >::type cascade_nodes(cascade_nodesSEXP); 15 | Rcpp::traits::input_parameter< List& >::type cascade_times(cascade_timesSEXP); 16 | Rcpp::traits::input_parameter< int& >::type n_edges(n_edgesSEXP); 17 | Rcpp::traits::input_parameter< std::string& >::type model(modelSEXP); 18 | Rcpp::traits::input_parameter< NumericVector& >::type params(paramsSEXP); 19 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 20 | Rcpp::traits::input_parameter< bool& >::type auto_edges(auto_edgesSEXP); 21 | Rcpp::traits::input_parameter< double& >::type cutoff(cutoffSEXP); 22 | rcpp_result_gen = Rcpp::wrap(netinf_(cascade_nodes, cascade_times, n_edges, model, params, quiet, auto_edges, cutoff)); 23 | return rcpp_result_gen; 24 | END_RCPP 25 | } 26 | // count_possible_edges_ 27 | int count_possible_edges_(List& cascade_nodes, List& cascade_times, bool quiet); 28 | RcppExport SEXP _NetworkInference_count_possible_edges_(SEXP cascade_nodesSEXP, SEXP cascade_timesSEXP, SEXP quietSEXP) { 29 | BEGIN_RCPP 30 | Rcpp::RObject rcpp_result_gen; 31 | Rcpp::RNGScope rcpp_rngScope_gen; 32 | Rcpp::traits::input_parameter< List& >::type cascade_nodes(cascade_nodesSEXP); 33 | Rcpp::traits::input_parameter< List& >::type cascade_times(cascade_timesSEXP); 34 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 35 | rcpp_result_gen = Rcpp::wrap(count_possible_edges_(cascade_nodes, cascade_times, quiet)); 36 | return rcpp_result_gen; 37 | END_RCPP 38 | } 39 | -------------------------------------------------------------------------------- /man/as_cascade_wide.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{as_cascade_wide} 4 | \alias{as_cascade_wide} 5 | \title{Transform wide data to cascade} 6 | \usage{ 7 | as_cascade_wide(data, node_names = NULL) 8 | } 9 | \arguments{ 10 | \item{data}{\link{data.frame} or \link{matrix}, rows corresponding to nodes, 11 | columns to cascades. Matrix entries are the event times for each node, 12 | cascade pair. Missing values indicate censored observations, that is, 13 | nodes that did not have an event). Specify column and row names if 14 | cascade and node ids other than integer sequences are desired. Note that, 15 | if the time column is of class date or any other special time class, it 16 | will be converted to an integer with `as.numeric()`.} 17 | 18 | \item{node_names}{character, factor or numeric vector, containing names for each node. 19 | Optional. If not provided, node names are inferred from the provided data.} 20 | } 21 | \value{ 22 | An object of class \code{cascade}. This is a list containing three 23 | (named) elements: 24 | \enumerate{ 25 | \item \code{"node_names"} A character vector of node names. 26 | \item \code{"cascade_nodes"} A list with one character vector per 27 | cascade containing the node names in order of the events. 28 | \item \code{"cascade_times"} A list with one element per cascade 29 | containing the event times for the nodes in \code{"cascade_names"}. 30 | } 31 | } 32 | \description{ 33 | Create a cascade object from data in wide format. 34 | } 35 | \details{ 36 | If data is in wide format, each row corresponds to a node and each column to 37 | a cascade. Each cell indicates the event time for a node - cascade 38 | combination. If a node did not experience an event for a cascade (the node 39 | is censored) the cell entry must be \code{NA}. 40 | } 41 | \examples{ 42 | 43 | data("policies") 44 | cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 45 | event_time = 'adopt_year', cascade_id = 'policy') 46 | wide_policies = as.matrix(cascades) 47 | cascades <- as_cascade_wide(wide_policies) 48 | is.cascade(cascades) 49 | 50 | } 51 | -------------------------------------------------------------------------------- /src/netinf_utilities.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "netinf_utilities.h" 8 | 9 | using namespace Rcpp; 10 | 11 | 12 | NumericVector init_parameters(NumericVector &cascade_times, std::string model) { 13 | NumericVector out; 14 | return out; 15 | } 16 | 17 | 18 | int get_index(IntegerVector x, int val) { 19 | for(int i = 0; i < x.size(); i++) { 20 | if(x[i] == val) return i; 21 | } 22 | return -1; 23 | } 24 | 25 | double sum_vector(NumericVector x) { 26 | double out = 0; 27 | for(int i = 0; i < x.size(); i++) { 28 | if(std::isnan(x[i])) continue; 29 | out += x[i]; 30 | } 31 | return out; 32 | } 33 | 34 | NumericVector copy_vector(NumericVector x) { 35 | NumericVector out(x.size()); 36 | for(int i = 0; i < x.size(); i++) out[i] = x[i]; 37 | return out; 38 | } 39 | 40 | void print_time_estimate(std::chrono::duration fp_ms, 41 | bool auto_edges, int n_edges) { 42 | float estimate; 43 | std::string message; 44 | if(auto_edges) { 45 | estimate = fp_ms.count(); 46 | message = "Estimated time per edge: "; 47 | } else { 48 | estimate = fp_ms.count() * n_edges; 49 | message = "Estimated completion time: "; 50 | } 51 | std::string unit = "milliseconds"; 52 | if ((estimate > 1000) & (estimate < 60000)) { 53 | estimate /= 1000; 54 | unit = "seconds"; 55 | } else if ((estimate > 60000) & (estimate < 3600000)) { 56 | estimate /= 60000; 57 | unit = "minutes"; 58 | } else if ((estimate > 3600000) & (estimate < 86400000)) { 59 | estimate /= 3600000; 60 | unit = "hours"; 61 | } else if (estimate > 86400000) { 62 | estimate /= 86400000; 63 | unit = "days"; 64 | } 65 | float out = roundf(estimate * 100) / 100; 66 | Rcout << message << out << " " << unit << ".\n"; 67 | } 68 | 69 | time_point print_timing(time_point start_time, std::string step) { 70 | time_point t2 = Clock::now(); 71 | time_duration fp_ms = t2 - start_time; 72 | Rcout << step << "took: " << fp_ms.count() << "ms\n"; 73 | return t2; 74 | } -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "readme_figures/README-" 12 | ) 13 | ``` 14 | 15 | # NetworkInference: Inferring Latent Diffusion Networks 16 | 17 | ![](https://travis-ci.org/desmarais-lab/NetworkInference.svg) ![](http://www.r-pkg.org/badges/version/NetworkInference) 18 | ![](http://cranlogs.r-pkg.org/badges/NetworkInference) 19 | ![](http://cranlogs.r-pkg.org/badges/grand-total/NetworkInference?color=yellow) 20 | 21 | ## About 22 | 23 | This package provides an R implementation of the netinf algorithm 24 | created by Gomez-Rodriguez, Leskovec, and Krause (see 25 | [here](http://snap.stanford.edu/netinf/) for more information and the original 26 | C++ implementation). Given a set of events that spread between a set of nodes 27 | the algorithm infers the most likely stable diffusion network that is underlying 28 | the diffusion process. 29 | 30 | ## Installation 31 | 32 | The package can be installed from [CRAN](https://CRAN.R-project.org/): 33 | ```{r, eval=FALSE} 34 | install.packages("NetworkInference") 35 | ``` 36 | 37 | The latest development version can be installed from 38 | [github](https://github.com/desmarais-lab/NetworkInference): 39 | ```{r, eval=FALSE} 40 | #install.packages(devtools) 41 | devtools::install_github('desmarais-lab/NetworkInference') 42 | ``` 43 | 44 | ## Quick start guide 45 | 46 | To get started, get your data into the `cascades` format required by the `netinf` 47 | function: 48 | 49 | ```{r, results='hide', message=FALSE} 50 | library(NetworkInference) 51 | 52 | # Simulate random cascade data 53 | df <- simulate_rnd_cascades(50, n_node = 20) 54 | 55 | # Cast data into `cascades` object 56 | ## From long format 57 | cascades <- as_cascade_long(df) 58 | 59 | ## From wide format 60 | df_matrix <- as.matrix(cascades) ### Create example matrix 61 | cascades <- as_cascade_wide(df_matrix) 62 | ``` 63 | 64 | Then fit the model: 65 | ```{r} 66 | result <- netinf(cascades, quiet = TRUE, p_value_cutoff = 0.05) 67 | ``` 68 | 69 | ```{r, eval=FALSE} 70 | head(result) 71 | ``` 72 | ```{r, results="asis", echo=FALSE} 73 | pander::pandoc.table(head(result)) 74 | ``` -------------------------------------------------------------------------------- /tests/testthat/test_cascade.R: -------------------------------------------------------------------------------- 1 | library(NetworkInference) 2 | 3 | set.seed(2552) 4 | context("Test if cascade data structure and related methods work.") 5 | 6 | n_casc <- 100 7 | n_nodes <- 100 8 | 9 | # Long format 10 | test_that("Long format", { 11 | dat <- simulate_rnd_cascades(n_casc, n_nodes) 12 | casc <- as_cascade_long(data = dat, cascade_node_name = "node_name", 13 | event_time = "event_time", 14 | cascade_id = "cascade_id") 15 | df <- as.data.frame(casc) 16 | df <- df[order(df$cascade_id, df$node_name), ] 17 | dat$cascade_id <- as.character(dat$cascade_id) 18 | dat <- dat[order(dat$cascade_id, dat$node_name), ] 19 | rownames(dat) <- c(1:nrow(dat)) 20 | rownames(df) <- c(1:nrow(df)) 21 | expect_equal(dat, df) 22 | }) 23 | 24 | # Wide format 25 | test_that("Wide format", { 26 | dat <- simulate_rnd_cascades(n_casc, n_nodes) 27 | casc <- as_cascade_long(data = dat, cascade_node_name = "node_name", 28 | event_time = "event_time", 29 | cascade_id = "cascade_id") 30 | m <- as.matrix(casc) 31 | cascade <- as_cascade_wide(m) 32 | expect_equal(casc, cascade) 33 | }) 34 | 35 | # Subsetting 36 | test_that("Subsetting works", { 37 | data("cascades") 38 | reduced_cascade <- subset_cascade_time(cascades, 10, 20, drop=TRUE) 39 | expect_equal(length(reduced_cascade), length(cascades)) 40 | expect_equal(length(reduced_cascade$cascade_nodes), length(reduced_cascade$cascade_times)) 41 | expect_equal(min(do.call(c, reduced_cascade$cascade_times)), 10.1, tolerance = 0.1) 42 | expect_equal(max(do.call(c, reduced_cascade$cascade_times)), 19.8, tolerance = 0.1) 43 | reduced_cascade <- subset_cascade_time(cascades, 10, 20, drop=FALSE) 44 | expect_equal(length(reduced_cascade), length(cascades)) 45 | expect_equal(length(reduced_cascade$cascade_nodes), length(reduced_cascade$cascade_times)) 46 | expect_equal(length(reduced_cascade$cascade_nodes), length(cascades$cascade_nodes)) 47 | expect_equal(length(reduced_cascade$cascade_times), length(cascades$cascade_times)) 48 | expect_equal(min(do.call(c, reduced_cascade$cascade_times)), 10.1, tolerance = 0.1) 49 | expect_equal(max(do.call(c, reduced_cascade$cascade_times)), 19.8, tolerance = 0.1) 50 | expect_warning(as.data.frame(reduced_cascade)) 51 | }) 52 | -------------------------------------------------------------------------------- /man/simulate_cascades.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simulate_cascades.R 3 | \name{simulate_cascades} 4 | \alias{simulate_cascades} 5 | \title{Simulate cascades from a diffusion network} 6 | \usage{ 7 | simulate_cascades(diffnet, nsim = 1, max_time = Inf, 8 | start_probabilities = NULL, partial_cascade = NULL, params = NULL, 9 | model = NULL, nodes = NULL) 10 | } 11 | \arguments{ 12 | \item{diffnet}{object of class \code{diffnet}.} 13 | 14 | \item{nsim}{integer, number of cascades to simulate.} 15 | 16 | \item{max_time}{numeric, the maximum time after which observations are 17 | censored} 18 | 19 | \item{start_probabilities}{a vector of probabilities for each node in diffnet, 20 | to be the node with the first event. If \code{NULL} a node is drawn from 21 | a uniform distribution over all nodes.} 22 | 23 | \item{partial_cascade}{object of type cascade, containing one partial 24 | cascades for which further development should be simulated.} 25 | 26 | \item{params}{numeric, (optional) parameters for diffusion time distribution. 27 | See the details section of \code{\link{netinf}} for specification details. 28 | Only use this argument if parameters different from those contained in the 29 | \code{diffnet} object should be used or the network is not an object of 30 | class \code{diffnet}.} 31 | 32 | \item{model}{character, diffusion model to use. One of \code{c("exponential", 33 | "rayleigh", "log-normal")}. Only use this argument if parameters different 34 | from those contained in the \code{diffnet} object should be used or the 35 | network is not an object of class \code{diffnet}.} 36 | 37 | \item{nodes}{vector of node ids if different from nodes included in 38 | \code{diffnet}} 39 | } 40 | \value{ 41 | A data frame with three columns. Containing 1) The names of 42 | the nodes (\code{"node_name"}) that experience an event in each cascade, 43 | 2) the event time (\code{"event_time"}) of the corresponding node, 44 | 3) the cascade identifier \code{"cascade_id"}. 45 | } 46 | \description{ 47 | Simulate diffusion cascades based on the generative model underlying netinf 48 | and a diffusion network. 49 | } 50 | \examples{ 51 | 52 | data(cascades) 53 | out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1) 54 | simulated_cascades <- simulate_cascades(out, nsim = 10) 55 | 56 | # Simulation from partial cascade 57 | 58 | } 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # NetworkInference: Inferring Latent Diffusion Networks 5 | 6 | ![](https://travis-ci.org/desmarais-lab/NetworkInference.svg) 7 | ![](http://www.r-pkg.org/badges/version/NetworkInference) 8 | ![](http://cranlogs.r-pkg.org/badges/NetworkInference) 9 | ![](http://cranlogs.r-pkg.org/badges/grand-total/NetworkInference?color=yellow) 10 | 11 | ## About 12 | 13 | This package provides an R implementation of the netinf algorithm 14 | created by Gomez-Rodriguez, Leskovec, and Krause (see 15 | [here](http://snap.stanford.edu/netinf/) for more information and the 16 | original C++ implementation). Given a set of events that spread between 17 | a set of nodes the algorithm infers the most likely stable diffusion 18 | network that is underlying the diffusion process. 19 | 20 | ## Installation 21 | 22 | The package can be installed from [CRAN](https://CRAN.R-project.org/): 23 | 24 | ``` r 25 | install.packages("NetworkInference") 26 | ``` 27 | 28 | The latest development version can be installed from 29 | [github](https://github.com/desmarais-lab/NetworkInference): 30 | 31 | ``` r 32 | #install.packages(devtools) 33 | devtools::install_github('desmarais-lab/NetworkInference') 34 | ``` 35 | 36 | ## Quick start guide 37 | 38 | To get started, get your data into the `cascades` format required by the 39 | `netinf` function: 40 | 41 | ``` r 42 | library(NetworkInference) 43 | 44 | # Simulate random cascade data 45 | df <- simulate_rnd_cascades(50, n_node = 20) 46 | 47 | # Cast data into `cascades` object 48 | ## From long format 49 | cascades <- as_cascade_long(df) 50 | 51 | ## From wide format 52 | df_matrix <- as.matrix(cascades) ### Create example matrix 53 | cascades <- as_cascade_wide(df_matrix) 54 | ``` 55 | 56 | Then fit the model: 57 | 58 | ``` r 59 | result <- netinf(cascades, quiet = TRUE, p_value_cutoff = 0.05) 60 | ``` 61 | 62 | ``` r 63 | head(result) 64 | ``` 65 | 66 | | origin\_node | destination\_node | improvement | p\_value | 67 | | :----------: | :---------------: | :---------: | :-------: | 68 | | 20 | 7 | 290.1 | 7.324e-06 | 69 | | 8 | 17 | 272 | 1.875e-05 | 70 | | 3 | 2 | 270.5 | 1.87e-05 | 71 | | 20 | 5 | 262.8 | 1.899e-05 | 72 | | 7 | 16 | 250.4 | 4.779e-05 | 73 | | 20 | 15 | 249 | 4.774e-05 | 74 | -------------------------------------------------------------------------------- /src/possible_edges.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::depends(RcppProgress)]] 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "possible_edges.h" 7 | 8 | using namespace Rcpp; 9 | 10 | edge_map get_possible_edges_(List &cascade_nodes, List &cascade_times, 11 | bool& quiet) { 12 | edge_map possible_edges; 13 | int n_cascades = cascade_nodes.size(); 14 | if(!quiet) Rcout << "Getting possible edges...\n"; 15 | Progress p(n_cascades, !quiet); 16 | for(int c = 0; c < n_cascades; c++) { 17 | checkUserInterrupt(); 18 | IntegerVector this_cascade_nodes = cascade_nodes[c]; 19 | NumericVector this_cascade_times = cascade_times[c]; 20 | int csize = this_cascade_nodes.size(); 21 | 22 | // Use the fact that the cascade data is ordered (see cascade.R) 23 | for(int i = 0; i < csize; i++) { 24 | int parent = this_cascade_nodes[i]; 25 | double t_parent = this_cascade_times[i]; 26 | for(int j = i + 1; j < csize; j++) { 27 | int child = this_cascade_nodes[j]; 28 | double t_child = this_cascade_times[j]; 29 | 30 | // If times are tied skip this combination 31 | if(t_parent >= t_child) { 32 | continue; 33 | } 34 | 35 | // Check if pair is in pair collection. If not include 36 | std::array pair_id = {{child, parent}}; 37 | 38 | auto it = possible_edges.find(pair_id); 39 | if(it == possible_edges.end()) { 40 | std::vector possible_cascades; 41 | possible_cascades.push_back(c); 42 | double improvement = -1; 43 | edge_value value = make_pair(possible_cascades, 44 | improvement); 45 | possible_edges.insert(make_pair(pair_id, value)); 46 | } else { 47 | it->second.first.push_back(c); 48 | } 49 | } 50 | } 51 | p.increment(); 52 | } 53 | return possible_edges; 54 | } 55 | 56 | //[[Rcpp::export]] 57 | int count_possible_edges_(List &cascade_nodes, List &cascade_times, 58 | bool quiet=true) { 59 | edge_map edges = get_possible_edges_(cascade_nodes, cascade_times, quiet); 60 | return edges.size(); 61 | } 62 | -------------------------------------------------------------------------------- /src/spanning_tree.h: -------------------------------------------------------------------------------- 1 | using namespace Rcpp; 2 | 3 | /** 4 | * Calculate the weighted log likelihood of an edge (i -> j) in a tree. 5 | * 6 | * @param event_time_i Time node i experienced the event 7 | * @param event_time_j Time node j experienced the event 8 | * @param model The diffusion model. 9 | * @param params Parameters of the density funtion of the diffusion 10 | * model. 11 | * @param tied Is the edge tied in the tree/cascade or is it out of network 12 | * diffusion 13 | * 14 | * @return Weighted log-likelihood score of the edge 15 | */ 16 | double edge_score(double &event_time_i, double &event_time_j, std::string &model, 17 | NumericVector ¶ms, bool tied); 18 | 19 | /** 20 | * Generate the optimal spanning tree for a cascade. 21 | * 22 | * @param cascade_nodes Integer vector of node ids in the order in which they 23 | * experienced the event. 24 | * @param cascade_times The event times for the corresponding nodes in 25 | * cascade_nodes. 26 | * @param model The diffusion model. 27 | * @param params Parameters of the density funtion of the diffusion 28 | * model. 29 | * 30 | * @return A list containing two vectors and a scalar: 31 | * [0] Integer vector of parent ids. Each id is the parent of the node in the 32 | * corresponding position of the input `cascade_nodes` 33 | * [1] Numeric vector of likelihood scores. Each score is the score for the 34 | * edge form the node in [0] to the node in the respective position in input 35 | * `cascade_nodes` 36 | * [2] The total score of the tree (sum of all edge scores) 37 | */ 38 | List optimal_spanning_tree(IntegerVector &cascade_nodes, 39 | NumericVector &cascade_times, std::string &model, 40 | NumericVector ¶ms); 41 | 42 | /** 43 | * Construct the optimal spanning tree for all cascades 44 | * 45 | * @param cascade_nodes List of integer vectors of node ids in the order in 46 | * which they experienced the event in the respective cascade 47 | * @param cascade_times List of numeric vectors containing the event times 48 | * for the corresponding nodes in cascade_nodes. 49 | * @param params Parameters of the density funtion of the diffusion 50 | * model. 51 | * @param model The diffusion model. 52 | * 53 | * @returns A list containing the optimal spanning tree for each cascade (see 54 | * optimal spanning tree for the data format of each tree) 55 | */ 56 | List initialize_trees(List &cascade_nodes, List &cascade_times, 57 | NumericVector ¶ms, std::string &model); -------------------------------------------------------------------------------- /man/as_cascade_long.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cascade.R 3 | \name{as_cascade_long} 4 | \alias{as_cascade_long} 5 | \title{Transform long data to cascade} 6 | \usage{ 7 | as_cascade_long(data, cascade_node_name = "node_name", 8 | event_time = "event_time", cascade_id = "cascade_id", 9 | node_names = NULL) 10 | } 11 | \arguments{ 12 | \item{data}{\link{data.frame}, containing the cascade data 13 | with column names corresponding to the arguments provided to 14 | \code{cascade_node_names}, \code{event_time} and \code{cascade_id}.} 15 | 16 | \item{cascade_node_name}{character, column name of \code{data} that specifies 17 | the node names in the cascade.} 18 | 19 | \item{event_time}{character, column name of \code{data} that specifies the 20 | event times for each node involved in a cascade.} 21 | 22 | \item{cascade_id}{character, column name of the cascade identifier.} 23 | 24 | \item{node_names}{character, factor or numeric vector containing the names for each node. 25 | Optional. If not provided, node names are inferred from the cascade data.} 26 | } 27 | \value{ 28 | An object of class \code{cascade}. This is a list containing three 29 | (named) elements: 30 | \enumerate{ 31 | \item \code{"node_names"} A character vector of node names. 32 | \item \code{"cascade_nodes"} A list with one character vector per 33 | cascade containing the node names in order of the events. 34 | \item \code{"cascade_times"} A list with one element per cascade 35 | containing the event times for the nodes in \code{"cascade_names"}. 36 | } 37 | } 38 | \description{ 39 | Create a cascade object from data in long format. 40 | } 41 | \details{ 42 | Each row of the data describes one event in the cascade. The data must 43 | contain at least three columns: 44 | \enumerate{ 45 | \item Cascade node name: The identifier of the node that experiences the 46 | event. 47 | \item Event time: The time when the node experiences the event. Note that 48 | if the time column is of class date or any other special time class, 49 | it will be converted to an integer with `as.numeric()`. 50 | \item Cascade id: The identifier of the cascade that the event pertains to. 51 | } 52 | The default names for these columns are \code{node_name}, \code{event_time} 53 | and \code{cascade_id}. If other names are used in the \code{data} object the 54 | names have to be specified in the corresponding arguments (see argument 55 | documentation) 56 | } 57 | \examples{ 58 | 59 | df <- simulate_rnd_cascades(10, n_nodes = 20) 60 | cascades <- as_cascade_long(df) 61 | is.cascade(cascades) 62 | } 63 | -------------------------------------------------------------------------------- /man/policies.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/data.R 3 | \docType{data} 4 | \name{policies} 5 | \alias{policies} 6 | \alias{policies_metadata} 7 | \title{US State Policy Adoption (SPID)} 8 | \format{The data comes in two objects of class \code{data.frame}. The first 9 | object, named \code{policies} contains the adoption events. Each row 10 | corresponds to an adoption event. Each adoption event is described by 11 | the three columns: 12 | \itemize{ 13 | \item \code{statenam}: Name of the adopting state. 14 | \item \code{policy}: Name of the policy. 15 | \item \code{adopt_year}: Year when the state adopted the policy. 16 | } 17 | The second object (\code{policies_metadata}) contains more details on each 18 | of the policies. It contains these columns: 19 | \itemize{ 20 | \item \code{policy}: Name of the policy. 21 | \item \code{source}: Original source of the data. 22 | \item \code{first_year}: First year any state adopted this policy. 23 | \item \code{last_year}: Last year any state adopted this policy. 24 | \item \code{adopt_count}: Number of states that adopted this policy. 25 | \item \code{description}: Description of the policy. 26 | \item \code{majortopic}: Topic group the policy belongs to. 27 | } 28 | Both \code{data.frame} objects can be joined (merged) on the common column 29 | \code{policy} (see example code).} 30 | \source{ 31 | \doi{10.7910/DVN/CVYSR7} 32 | } 33 | \usage{ 34 | data(policies) 35 | } 36 | \description{ 37 | The SPID data includes information on the year of adoption for over 700 38 | policies in the American states. 39 | } 40 | \details{ 41 | This version 1.0 of the database. For each policy we document the year of first 42 | adoption for each state. Adoption dates range from 1691 to 2017 and includes 43 | all fifty states. Policies are adopted by anywhere from 1 to 50 states, with 44 | an average of 24 adoptions. The data were assembled from a variety of sources, 45 | including academic publications and policy advocacy/information groups. 46 | Policies were coded according to the Policy Agendas Project major 47 | topic code. Additional information on policies is available at the source 48 | repository. 49 | } 50 | \examples{ 51 | 52 | data('policies') 53 | 54 | # Join the adoption events with the metadata 55 | merged_policies <- merge(policies, policies_metadata, by = 'policy') 56 | } 57 | \references{ 58 | Boehmke, Frederick J.; Mark Brockway; Bruce A. Desmarais; 59 | Jeffrey J. Harden; Scott LaCombe; Fridolin Linder; and 60 | Hanna Wallach. 2018. "A New Database for Inferring Public Policy 61 | Innovativeness and Diffusion Networks." Working paper. 62 | } 63 | \keyword{datasets} 64 | -------------------------------------------------------------------------------- /R/summary.R: -------------------------------------------------------------------------------- 1 | #' Summarize a cascade object 2 | #' 3 | #' Generates summary statistics for single cascades and across cascades in a 4 | #' collection, contained in a cascades object. 5 | #' 6 | #' @param object object of class cascade to be summarized. 7 | #' @param quiet logical, if \code{FALSE} summary stats are printed to std out. 8 | #' @param ... Additional arguments passed to summary. 9 | #' @return Prints cascade summary information to the screen 10 | #' (if \code{quiet = FALSE}). \code{'# cascades'} is the number of cascades in 11 | #' the object, \code{'# nodes'} is the number of nodes in the system (nodes 12 | #' that can theoretically experience an event), \code{'# nodes in cascades'} is 13 | #' the number of unique nodes of the system that experienced an event and 14 | #' \code{'# possible edges'} is the number of edges that are possible given 15 | #' the cascade data (see \code{\link{count_possible_edges}} for details.). 16 | #' 17 | #' Additional summaries for each cascade are returned invisibly. 18 | # as a \code{data.frame} with columns \code{name} (name of the 19 | #' cascade), \code{length} (length of the cascade as an integer of how many 20 | #' nodes experienced and event) and \code{n_ties} (number of tied event 21 | #' times per cascade). 22 | #' 23 | #' @examples 24 | #' data(cascades) 25 | #' summary(cascades) 26 | #' 27 | #' @export 28 | summary.cascade <- function(object, quiet = FALSE, ...) { 29 | # Cascade info 30 | casc_lengths <- sapply(object$cascade_nodes, length) 31 | dups <- sapply(object$cascade_times, count_dups_) 32 | names(casc_lengths) <- NULL 33 | casc_names <- names(object$cascade_nodes) 34 | casc_info <- data.frame("name" = casc_names, "length" = casc_lengths, 35 | "n_ties" = dups) 36 | # Print out summaries 37 | if(!quiet) { 38 | # Calculate all summaries 39 | n_nodes <- length(object$node_names) 40 | n_nodes_in_casc <- length(unique(do.call(c, object$cascade_nodes))) 41 | npe <- count_possible_edges(object) 42 | pout <- cbind(summary(casc_lengths), summary(dups)) 43 | colnames(pout) <- c("length", "ties") 44 | cat(paste0('# cascades: ', length(object$cascade_times), '\n')) 45 | cat(paste0('# nodes: ', n_nodes, '\n')) 46 | cat(paste0('# nodes in cascades: ', n_nodes_in_casc, '\n')) 47 | cat(paste0('# possible edges: ', npe, '\n\n')) 48 | cat("Summary statistics for cascade length and number of ties:\n") 49 | print(pout) 50 | } 51 | 52 | return(invisible(casc_info)) 53 | } 54 | 55 | count_dups_ <- function(x) { 56 | return(length(x) - length(unique(x))) 57 | } 58 | 59 | -------------------------------------------------------------------------------- /src/netinf.h: -------------------------------------------------------------------------------- 1 | #include "possible_edges.h" 2 | 3 | using namespace Rcpp; 4 | 5 | /** 6 | * For each cascade the edge u -> v is possible check if it improves fit and 7 | * keep track of the ones where it does 8 | * 9 | * @param u Integer id of parent node 10 | * @param v Integer id of child node 11 | * @param possible_edges edge_map containing data on all possible edges 12 | * @param cascade_nodes A list of integer vectors containing the node ids of 13 | * the cascade in order of infection. 14 | * @param cascade_times A list of numeric vectors each containing infection 15 | * times for the corresponding nodes in \code{cascade_ids}. 16 | * 17 | * @return An Rcpp List containing: 18 | * [0]: Aggregate improvement from this edge over all trees 19 | * [1]: An integer vector of cascades where the edge caused improvement 20 | * [2]: The scores of the edge in each of the cascades in [1] 21 | */ 22 | List tree_replacement(int &u, int &v, edge_map &possible_edges, 23 | List &cascade_times, List &cascade_nodes, 24 | List &trees, std::string &model, NumericVector ¶ms); 25 | 26 | /** 27 | * Run the netinf algorithm on a set of nodes and cascades 28 | * 29 | * @param cascade_nodes A list of integer vectors containing the node ids of 30 | * the cascade in order of infection. 31 | * @param cascade_times A list of numeric vectors each containing infection 32 | * times for the corresponding nodes in \code{cascade_ids}. 33 | * @param model integer indicating the choice of model: 1: exponential, 34 | * 2: power law, 3: rayleigh (only exponential implemented). 35 | * @param params NumericVector, Parameters for transmission model. 36 | * @param n_edges Integer, number of edges to infer. 37 | * @param quiet, Boolean, Should output on progress by suppressed. 38 | * @param cutoff, p-value cutoff if auto-edges=TRUE 39 | * 40 | * @return List containing one vector per edge. 41 | */ 42 | List netinf_(List &cascade_nodes, List &cascade_times, 43 | int &n_edges, std::string &model, NumericVector ¶ms, bool quiet, 44 | bool auto_edges, double cutoff); 45 | 46 | /** 47 | * Update the trees for each cascade using the new edge 48 | * 49 | * @param trees List of trees for each cascade (see spanning_tree.h for 50 | * documentation of the data structure) 51 | * @param tree_scores Numeric Vector of aggregate likelihood scores for each tree 52 | * @param replacement_data Return object from tree_replacement. 53 | * @param cascade_nodes A list of integer vectors containing the node ids of 54 | * the cascade in order of infection. 55 | * @param best_edge node ids of the edge that's updated 56 | * @param 57 | */ 58 | void update_trees(List &trees, NumericVector &tree_scores, 59 | List &replacement_data, List &cascade_nodes, 60 | std::array best_edge); -------------------------------------------------------------------------------- /man/netinf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/netinf.R 3 | \name{netinf} 4 | \alias{netinf} 5 | \title{Infer latent diffusion network} 6 | \usage{ 7 | netinf(cascades, trans_mod = "exponential", n_edges = NULL, 8 | p_value_cutoff = NULL, params = NULL, quiet = FALSE, 9 | trees = FALSE) 10 | } 11 | \arguments{ 12 | \item{cascades}{an object of class cascade containing node and cascade 13 | information. See \code{\link{as_cascade_long}} and 14 | \code{\link{as_cascade_wide}} for details.} 15 | 16 | \item{trans_mod}{character, indicating the choice of model: 17 | \code{"exponential"}, \code{"rayleigh"} or \code{"log-normal"}.} 18 | 19 | \item{n_edges}{integer, number of edges to infer. Leave unspecified if using 20 | \code{p_value_cutoff}.} 21 | 22 | \item{p_value_cutoff}{numeric, in the interval (0, 1). If 23 | specified, edges are inferred in each iteration until the Vuong test for 24 | edge addition reaches the p-value cutoff or when the maximum 25 | possible number of edges is reached. Leave unspecified if using 26 | \code{n_edges} to explicitly specify number of edges to infer.} 27 | 28 | \item{params}{numeric, Parameters for diffusion model. If left unspecified 29 | reasonable parameters are inferred from the data. See details for how to 30 | specify parameters for the different distributions.} 31 | 32 | \item{quiet}{logical, Should output on progress by suppressed.} 33 | 34 | \item{trees}{logical, Should the inferred cascade trees be returned. Note, 35 | that this will lead to a different the structure of the function output. 36 | See section Value for details.} 37 | } 38 | \value{ 39 | Returns the inferred diffusion network as an edgelist in an object of 40 | class \code{diffnet} and \code{\link[base]{data.frame}}. The first 41 | column contains the sender, the second column the receiver node. The 42 | third column contains the improvement in fit from adding the edge that is 43 | represented by the row. The output additionally has the following 44 | attributes: 45 | \itemize{ 46 | \item \code{"diffusion_model"}: The diffusion model used to infer the 47 | diffusion network. 48 | \item \code{"diffusion_model_parameters"}: The parameters for the 49 | model that have been inferred by the approximate profile MLE 50 | procedure. 51 | } 52 | If the argument \code{trees} is set to \code{TRUE}, the output is a list 53 | with the first element being the \code{data.frame} described above, and 54 | the second element being the trees in edge-list form in a single 55 | \code{data.frame}. 56 | } 57 | \description{ 58 | Infer a network of diffusion ties from a set of cascades. Each cascade 59 | is defined by pairs of node ids and infection times. 60 | } 61 | \details{ 62 | The algorithm is describe in detail in Gomez-Rodriguez et al. (2010). 63 | Additional information can be found on the 64 | netinf website (\url{http://snap.stanford.edu/netinf/}). 65 | 66 | \itemize{ 67 | \item Exponential distribution: \code{trans_mod = "exponential"}, 68 | \code{params = c(lambda)}. 69 | Parametrization: \eqn{\lambda e^{-\lambda x}}. 70 | \item Rayleigh distribution: \code{trans_mod = "rayleigh"}, 71 | \code{params = c(alpha)}. 72 | Parametrization: \eqn{\frac{x}{\alpha^2} \frac{e^{-x^2}}{2\alpha^2}}. 73 | \item Log-normal distribution: \code{trans_mod = "log-normal"}, 74 | \code{params = c(mu, sigma)}. 75 | Parametrization: \eqn{\frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(ln x - \mu)^2}{2\sigma^2}}}. 76 | } 77 | 78 | If higher performance is required and for very large data sets, a faster pure C++ 79 | implementation is available in the Stanford Network Analysis Project (SNAP). 80 | The software can be downloaded at \url{http://snap.stanford.edu/netinf/}. 81 | } 82 | \examples{ 83 | 84 | # Data already in cascades format: 85 | data(cascades) 86 | out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1) 87 | 88 | # Starting with a dataframe 89 | df <- simulate_rnd_cascades(10, n_nodes = 20) 90 | cascades2 <- as_cascade_long(df, node_names = unique(df$node_name)) 91 | out <- netinf(cascades2, trans_mod = "exponential", n_edges = 5, params = 1) 92 | 93 | } 94 | \references{ 95 | M. Gomez-Rodriguez, J. Leskovec, A. Krause. Inferring Networks of Diffusion 96 | and Influence.The 16th ACM SIGKDD Conference on Knowledge Discovery and 97 | Data Mining (KDD), 2010. 98 | } 99 | -------------------------------------------------------------------------------- /src/spanning_tree.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "distributions.h" 3 | 4 | using namespace Rcpp; 5 | 6 | double edge_score(double &event_time_i, double &event_time_j, std::string &model, 7 | NumericVector ¶ms, bool tied) { 8 | double x = event_time_j - event_time_i; 9 | double y; 10 | double out; 11 | if (model == "exponential") { 12 | y = dexp_(x, params[0]); 13 | } else if (model == "rayleigh") { 14 | y = drayleigh_(x, params[0]); 15 | } else if (model == "log-normal") { 16 | y = dlognormal_(x, params[0], params[1]); 17 | } 18 | if (tied) { 19 | out = log(0.5 * y); 20 | } else { 21 | out = log(0.0000000001 * y); 22 | } 23 | return out; 24 | } 25 | 26 | List optimal_spanning_tree(IntegerVector &cascade_nodes, 27 | NumericVector &cascade_times, std::string &model, 28 | NumericVector ¶ms) { 29 | 30 | // Init containers for the results 31 | int cascade_size = cascade_nodes.size(); 32 | NumericVector parent_scores(cascade_size); 33 | IntegerVector parent_ids(cascade_size); 34 | 35 | // For each node involved in this cascade find the parent and the weight for 36 | // the respective edge 37 | double tree_score = 0; 38 | for(int i = 0; i < cascade_size; i++) { 39 | // Only nodes that have an earlier event time can be parents for current 40 | // node 41 | IntegerVector possible_parents; 42 | NumericVector parent_times; 43 | for(int j = 0; j < cascade_size; j++) { 44 | if (cascade_times[j] < cascade_times[i]) { 45 | possible_parents.push_back(cascade_nodes[j]); 46 | parent_times.push_back(cascade_times[j]); 47 | } 48 | } 49 | // Find the parent with the highest score if there are possible parents 50 | int n_parents = possible_parents.size(); 51 | // If there are multiple potential parents find the one that gives the e 52 | // edge the maximum weight 53 | if (n_parents > 0) { 54 | double max_parent_score = -INFINITY; 55 | int parent; 56 | double score; 57 | for (int k = 0; k < n_parents; k++) { 58 | score = edge_score(parent_times[k], cascade_times[i], model, 59 | params, false); 60 | if (score > max_parent_score) { 61 | max_parent_score = score; 62 | parent = possible_parents[k]; 63 | } 64 | } 65 | // Select the parent with the max score and store the score 66 | if(max_parent_score == -INFINITY) { 67 | std::string msg = "Observed time with zero likelihood. Consider adjusting lambda.\n"; 68 | throw std::invalid_argument(msg); 69 | } 70 | parent_ids[i] = parent; 71 | parent_scores[i] = max_parent_score; 72 | tree_score += max_parent_score; 73 | 74 | // If node can't have parent (fist node in cascade or tied first nodes) 75 | // set parent id and score to NA 76 | } else { 77 | parent_ids[i] = -1; 78 | parent_scores[i] = NA_REAL; 79 | } 80 | } 81 | List out = List::create(parent_ids, parent_scores, tree_score); 82 | return out; 83 | } 84 | 85 | List initialize_trees(List &cascade_nodes, List &cascade_times, 86 | NumericVector ¶ms, std::string &model) { 87 | 88 | // Output container 89 | int n_cascades = cascade_nodes.size(); 90 | List out(n_cascades); 91 | NumericVector tree_scores(n_cascades, NA_REAL); 92 | 93 | // Calculate optimal spanning tree for each cascade 94 | for(int i = 0; i < n_cascades; i++) { 95 | checkUserInterrupt(); 96 | IntegerVector this_cascade_ids = cascade_nodes[i]; 97 | NumericVector this_cascade_times = cascade_times[i]; 98 | List tree_result = optimal_spanning_tree(this_cascade_ids, 99 | this_cascade_times, model, 100 | params); 101 | tree_scores[i] = tree_result[2]; 102 | out[i] = tree_result; 103 | } 104 | return List::create(out, tree_scores); 105 | } 106 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # NetworkInference 1.2.5.9000 2 | 3 | Development Version 4 | 5 | ## New Features 6 | 7 | ## Bug Fixes 8 | 9 | # NetworkInference 1.2.5 10 | 11 | * Update C++11 -> C++17 12 | * Minor fixes to package metadata 13 | 14 | # NetworkInference 1.2.4 15 | 16 | ## New Features 17 | 18 | ## Bug Fixes 19 | * Fixed potential memory error from iterating over the beginning/end of a `std::map`. The last fix didn't work under all circumstances. 20 | 21 | # NetworkInference 1.2.3 22 | 23 | ## New Features 24 | 25 | ## Bug Fixes 26 | * Fixed potential memory error from iterating over the beginning/end of a `std::map` 27 | 28 | # NetworkInference 1.2.2 29 | 30 | ## New Features 31 | 32 | ## Bug Fixes 33 | * `netinf()` with `log-normal` model didn't run because of an index error in the argument check for `params` 34 | * Fixed memory allocation error caused by uninitialized comparison 35 | 36 | # NetworkInference 1.2.1 37 | 38 | ## Bug Fixes 39 | 40 | * `netinf_` used ceiling on integer which caused error on Solaris 41 | 42 | # NetworkInference 1.2.0 43 | 44 | 45 | ## New Features 46 | 47 | #### Changes to `netinf()` 48 | 49 | * `netinf()` got another **speed-up**. After the first edge, the computation 50 | time for each edge is reduced by the factor number of nodes in the network 51 | * Number of edges can now be chosen using a **Vuong style test**. If this 52 | procedure should be used, a p-value is chosen at which the inference of new 53 | edges stops. This value is specified via the new `p_value_cutoff` argument 54 | to `netinf()`. 55 | * This lead to the netinf output having a **fourth column** now, containing the 56 | p-value for each edge. The p-value is also available if a fixed number of edges 57 | is chosen. 58 | * If no starting values are provided via the `params` argument parameters 59 | are initialized by choosing the midpoint between the maximum possible 60 | parameter value and the minimum possible value. These values are derived 61 | using the closed form MLE of the respective parameter, derived from 62 | either the minimum possible diffusion times (assuming a diffusion 63 | 'chain', i.e. `a -> b -> c -> ...`) or the maximum possible diffusion 64 | times (assuming a diffusion 'fan', i.e. `a -> b, a -> c, a -> d,...`). 65 | * `n_edges` can now specify either an absolute number of edges, or a p-value 66 | cutoff in the interval `(0, 1)` for the Vuong test 67 | * The log normal distribution is now available as a diffusion model. With this 68 | comes a **change in the arguments** for `netinf`. Instead of `lambda`, 69 | parameters are now specified with a vector (or scalar depending on 70 | distribution) `params`. For exponential and rayleigh distributions `params` 71 | is just the rate / alpha parameter. For the log-normal distribution `params` 72 | specifies mean and variance (in that order). See the `netinf()` 73 | documentation for details on specificaiton and parametrization (`?netinf`). 74 | * The output from `netinf()` now contains information on the model, parameters 75 | and iterations as attributes. See the documentation for details. 76 | * The `policies` dataset has been updated with over 600 new policies from the 77 | [SPID](https://doi.org/10.7910/DVN/CVYSR7) database 78 | (access via `data(policies)`). 79 | * Inferred cascade trees can now be returned by setting `trees = TRUE`. 80 | 81 | #### New functions 82 | * New function `drop_nodes()` now allows to drop nodes from all cascades in a cascade object. 83 | 84 | #### Changes to `simulate_cascades()` 85 | * `simulate_cascades()` now supports passing of additional (isolated in the diffusion network) nodes via the `nodes` argument. 86 | * `simulate_cascades()` now also supports the log-normal distribution. 87 | 88 | 89 | ## Bug Fixes 90 | 91 | * Inference of very uninformative edges (large number of edges) could lead for the software to break. Fixed now 92 | * In `simulate_cascades()` with partial cascades provided, it was possible that nodes experienced an event earlier than the last event in the partial cascade. Now, the earliest event time is the last observed event time in the partial cascade. 93 | 94 | ## Other changes 95 | 96 | * C++ code is now modularized and headers are properly documented 97 | 98 | 99 | # NetworkInference 1.1.2 100 | 101 | 102 | ## New Features 103 | 104 | * We made changes to the internal data structures of the netinf function, so it is much faster and memory efficient now. 105 | * `netinf()` now has a shiny progress bar! 106 | * `as.cascade` is now completely removed (see release note on version 1.1.0). 107 | * New convenience function to subset cascades by time (`subset_cascade_time`) and by cascade id (`subset_cascade`). 108 | 109 | ## Bug Fixes 110 | * Long running functions (that call compiled code) can now be interrupted without crashing the R session. 111 | * `as_cascade_long()` and `as_cascade_wide()` handle date input correctly now. 112 | * `as_cascade_wide()` couldn't handle data input of class `data.table`. 113 | 114 | 115 | # NetworkInference 1.1.1 116 | 117 | 118 | ## Bug Fixes 119 | 120 | * Use of igraph now conditional compliant with Writing R Extensions [1.1.3.1](https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Suggested-packages) 121 | * Fixed version number displayed in startup message 122 | 123 | 124 | # NetworkInference 1.1.0 125 | 126 | ## New Features 127 | 128 | * Data format (long or wide) of `as.cascade` is not bound to the class of the data object anymore. In 1.0.0 wide format had to be a matrix and long format had to be a dataframe. This did not make much sense. `as.cascade` is now deprecated and replaced by two new functions `as_cascade_long` and `as_cascade_wide`. 129 | 130 | ## Bug Fixes 131 | 132 | * x and y axis labels in `plot.cascade` with option `label_nodes=FALSE` were 133 | reversed 134 | 135 | 136 | # NetworkInference 1.0.0 137 | 138 | First release 139 | -------------------------------------------------------------------------------- /R/data.R: -------------------------------------------------------------------------------- 1 | #' US State Policy Adoption (SPID) 2 | #' 3 | #' The SPID data includes information on the year of adoption for over 700 4 | #' policies in the American states. 5 | #' 6 | #' This version 1.0 of the database. For each policy we document the year of first 7 | #' adoption for each state. Adoption dates range from 1691 to 2017 and includes 8 | #' all fifty states. Policies are adopted by anywhere from 1 to 50 states, with 9 | #' an average of 24 adoptions. The data were assembled from a variety of sources, 10 | #' including academic publications and policy advocacy/information groups. 11 | #' Policies were coded according to the Policy Agendas Project major 12 | #' topic code. Additional information on policies is available at the source 13 | #' repository. 14 | #' 15 | #' @name policies 16 | #' 17 | #' @usage data(policies) 18 | #' @docType data 19 | #' 20 | #' @format The data comes in two objects of class \code{data.frame}. The first 21 | #' object, named \code{policies} contains the adoption events. Each row 22 | #' corresponds to an adoption event. Each adoption event is described by 23 | #' the three columns: 24 | #' \itemize{ 25 | #' \item \code{statenam}: Name of the adopting state. 26 | #' \item \code{policy}: Name of the policy. 27 | #' \item \code{adopt_year}: Year when the state adopted the policy. 28 | #' } 29 | #' The second object (\code{policies_metadata}) contains more details on each 30 | #' of the policies. It contains these columns: 31 | #' \itemize{ 32 | #' \item \code{policy}: Name of the policy. 33 | #' \item \code{source}: Original source of the data. 34 | #' \item \code{first_year}: First year any state adopted this policy. 35 | #' \item \code{last_year}: Last year any state adopted this policy. 36 | #' \item \code{adopt_count}: Number of states that adopted this policy. 37 | #' \item \code{description}: Description of the policy. 38 | #' \item \code{majortopic}: Topic group the policy belongs to. 39 | #' } 40 | #' Both \code{data.frame} objects can be joined (merged) on the common column 41 | #' \code{policy} (see example code). 42 | #' 43 | #' @source \url{https://doi.org/10.7910/DVN/CVYSR7} 44 | #' 45 | #' @aliases policies policies_metadata 46 | #' 47 | #' @references Boehmke, Frederick J.; Mark Brockway; Bruce A. Desmarais; 48 | #' Jeffrey J. Harden; Scott LaCombe; Fridolin Linder; and 49 | #' Hanna Wallach. 2018. "A New Database for Inferring Public Policy 50 | #' Innovativeness and Diffusion Networks." Working paper. 51 | #' 52 | #' @examples 53 | #' 54 | #' data('policies') 55 | #' 56 | #' # Join the adoption events with the metadata 57 | #' merged_policies <- merge(policies, policies_metadata, by = 'policy') 58 | "policies" 59 | 60 | #' Example cascades 61 | #' 62 | #' An example dataset of 31 nodes and 54 cascades. From the original netinf 63 | #' implementation in SNAP. 64 | #' 65 | #' @name cascades 66 | #' 67 | #' @usage data(cascades) 68 | #' @docType data 69 | #' 70 | #' @format An object of class \code{cascade} containing 4 objects 71 | #' \describe{ 72 | #' \item{node_names}{Character node names} 73 | #' \item{cascade_nodes}{A list of integer vectors. Each containing the names of the 74 | #' nodes infected in this cascades in the order of infection} 75 | #' \item{cascade_times}{A list of numeric vectors. Each containing the infection 76 | #' times for the corresponding nodes in cascade_nodes} 77 | #' } 78 | #' @source \url{https://github.com/snap-stanford/snap/blob/master/examples/netinf/example-cascades.txt} 79 | "cascades" 80 | 81 | 82 | 83 | #' Validation output from netinf source. 84 | #' 85 | #' Contains output from original netinf C++ implementation, executed on 86 | #' \code{\link{cascades}}. For testing purposes. 87 | #' 88 | #' @name validation 89 | #' 90 | #' @usage data(validation) 91 | #' @docType data 92 | #' 93 | #' @format An object of class \code{data.frame} with 6 columns, containing: 94 | #' \describe{ 95 | #' \item{origin_node}{Origin of diffusion edge.} 96 | #' \item{destination_node}{Destination node of diffusion edge.} 97 | #' \item{volume}{??} 98 | #' \item{marginal_gain}{Marginal gain from edge.} 99 | #' \item{median_time_difference}{Median time between events in origin and 100 | #' destination} 101 | #' \item{mean_time_difference}{Mean time between events in origin and 102 | #' destination} 103 | #' } 104 | #' 105 | #' @source Output from netinf example program (\url{https://github.com/snap-stanford/snap/tree/master/examples/netinf}). 106 | "validation" 107 | 108 | # # Code to generate validation dataset. Data is copied from netinf source example 109 | # # ouput 110 | # 111 | # dat <- c(23,0,21,3049.810355,1.543899,2.136381, 112 | # 9,5,21,3049.291245,1.574027,2.161101, 113 | # 0,31,20,2905.326401,1.889040,2.099126, 114 | # 5,14,16,2326.861114,1.732281,1.936626, 115 | # 5,3,15,2181.930785,1.229308,1.903394) 116 | # 117 | # validation <- data.frame(matrix(dat, nc = 6, byrow = TRUE)) 118 | # validation <- validation[order(validation[, 1], validation[, 2]), ] 119 | # rownames(validation) <- c(1:nrow(validation)) 120 | # validation[, 1] <- as.character(validation[, 1]) 121 | # validation[, 2] <- as.character(validation[, 2]) 122 | # colnames(validation) <- c("origin_node", "destination_node", "volume", 123 | # "marginal_gain", "median_time_difference", 124 | # "mean_time_difference") 125 | # save(validation, file = 'data/validation.RData') 126 | 127 | 128 | #' Larger simulated validation network. 129 | #' 130 | #' A network from simulated data. For testing purposes. 131 | #' 132 | #' @name sim_validation 133 | #' 134 | #' @usage data(sim_validation) 135 | #' @docType data 136 | #' 137 | #' @format An object of class \code{data.frame} with 4 columns, containing: 138 | #' \describe{ 139 | #' \item{origin_node}{Origin of diffusion edge.} 140 | #' \item{destination_node}{Destination node of diffusion edge.} 141 | #' \item{improvement}{Improvement in score for the edge} 142 | #' \item{p-value}{p-value for vuong test} 143 | #' } 144 | #' 145 | #' @source See code below. 146 | "sim_validation" 147 | 148 | # # Code to generate validation dataset. 149 | # set.seed(142857) 150 | # df <- simulate_rnd_cascades(50, 50) 151 | # cascades <- as_cascade_long(df) 152 | # network <- netinf(cascades, n_edges = 0.05) 153 | # sim_validation <- list("input" = cascades, "output" = network) 154 | # save(sim_validation, file = 'data/sim_validation.RData') 155 | -------------------------------------------------------------------------------- /R/plot.R: -------------------------------------------------------------------------------- 1 | # Common plotting elements 2 | # 3 | # Plotting layout for NetworkInference package. 4 | # 5 | # @param mode What elements to return. 6 | # 7 | # @return A ggplot object that can be added to a ggplot plot 8 | PLOT_THEME_ <- function(mode = NULL) { 9 | if(is.null(mode)) { 10 | out <- theme_bw() 11 | } else if(mode == "color") { 12 | out <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 13 | "#0072B2", "#D55E00", "#CC79A7") 14 | } 15 | return(out) 16 | } 17 | 18 | #' Plot a cascade object 19 | #' 20 | #' Allows plotting of one or multiple, labeled or unlabeled cascades. 21 | #' 22 | #' The function returns a ggplot plot object (class \code{gg, ggplot}) which 23 | #' can be modified like any other ggplot. See the ggplot documentation and the 24 | #' examples below for more details. 25 | #' 26 | #' @import ggplot2 27 | #' @import ggrepel 28 | #' @importFrom stats density 29 | #' 30 | #' @param x object of class cascade to be plotted. 31 | #' @param label_nodes logical, indicating if should the nodes in each cascade be 32 | #' labeled. If the cascades are very dense setting this to \code{FALSE} is 33 | #' recommended. 34 | #' @param selection a vector of cascade ids to plot. 35 | #' @param ... additional arguments passed to plot. 36 | #' 37 | #' @examples 38 | #' 39 | #' data(cascades) 40 | #' plot(cascades, selection = names(cascades$cascade_nodes)[1:5]) 41 | #' plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20)) 42 | #' 43 | #' # Modify resulting ggplot object 44 | #' library(ggplot2) 45 | #' p <- plot(cascades, label_nodes = FALSE, selection = sample(1:54, 20)) 46 | #' ## Add a title 47 | #' p <- p + ggtitle('Your Title') 48 | #' p 49 | #' ## Change Axis 50 | #' p <- p + xlab("Your modified y axis label") #x and y labels are flipped here 51 | #' p <- p + ylab("Your modified x axis label") #x and y labels are flipped here 52 | #' p 53 | #' 54 | #' @return A ggplot plot object. 55 | #' @export 56 | plot.cascade <- function(x, label_nodes = TRUE, selection = NULL, ...) { 57 | 58 | # Check inputs 59 | assert_that(inherits(x, "cascade")) 60 | assert_that(inherits(label_nodes, "logical")) 61 | pdat <- as.data.frame(x) 62 | 63 | # Select cascades 64 | if(!is.null(selection)) { 65 | # Check selection input 66 | assert_that(length(selection) >= 1) 67 | assert_that(is.element(class(selection), c("character", "numeric", 68 | "integer", "factor"))) 69 | chk <- is.element(selection, unique(pdat$cascade_id)) 70 | if(!all(chk)) { 71 | msg <- paste("The following cascade id(s) provided in `selection` do", 72 | "not exist in the cascade object:", 73 | paste0(selection[!chk], collapse = ","), "\n") 74 | stop(msg) 75 | } 76 | selection <- as.character(selection) 77 | # Slice data 78 | sel <- is.element(pdat$cascade_id, selection) 79 | pdat <- pdat[sel, ] 80 | } 81 | 82 | pdat$cascade_id <- as.factor(pdat$cascade_id) 83 | 84 | if(length(unique(pdat$cascade_id)) > 20 & label_nodes) { 85 | msg <- paste("Plotting more than 20 cascades with labels is not", 86 | "recommended. Set label_nodes to FALSE or choose a subset", 87 | "of cascades using the `selection` argument\n") 88 | warning(msg) 89 | } 90 | 91 | # Plot 92 | palette <- PLOT_THEME_(mode = "color") 93 | 94 | ## Labeled Plot 95 | if(label_nodes) { 96 | p <- ggplot() + 97 | geom_line(aes_string(x = "event_time", y = "cascade_id"), 98 | color = "grey", linetype = 2, data = pdat) + 99 | geom_label_repel(aes_string(label = "node_name", color = "node_name", 100 | x = "event_time", y = "cascade_id"), 101 | size = 2.5, data = pdat) + 102 | ylab("Cascade ID") + xlab("Time") + 103 | scale_color_discrete(guide = FALSE) 104 | ## Unlabeled plot 105 | } else { 106 | p <- ggplot(pdat, aes_string(x = "cascade_id", y = "event_time")) + 107 | geom_violin() + 108 | geom_jitter(height = 0, width = 0.05, alpha = 0.6, size = 0.5) + 109 | xlab("Cascade ID") + ylab("Time") + 110 | coord_flip() 111 | } 112 | 113 | ## Layout 114 | p <- p + 115 | PLOT_THEME_() 116 | return(p) 117 | } 118 | 119 | #' Visualize netinf output 120 | #' 121 | #' Visualize the inferred diffusion network or the marginal gain in fit obtained 122 | #' by addition of each edge. 123 | #' 124 | #' If `type = improvement` a ggplot object is returned. It can be modified like 125 | #' any other ggplot. See the ggplot documentation and the examples in 126 | #' \link{plot.cascade}. 127 | #' 128 | #' @import ggplot2 129 | #' 130 | #' @param x object of class diffnet to be plotted. 131 | #' @param type character, one of \code{c("network", "improvement", "p-value")} 132 | #' indicating if the inferred diffusion network, the 133 | #' improvement for each edge or the p-value from the vuong test for each 134 | #' edge should be visualized . 135 | #' @param ... additional arguments. 136 | #' 137 | #' @examples 138 | #' 139 | #' \dontrun{ 140 | #' data(cascades) 141 | #' res <- netinf(cascades, quiet = TRUE) 142 | #' plot(res, type = "network") 143 | #' plot(res, type = "improvement") 144 | #' plot(res, type = "p-value") 145 | #' } 146 | #' 147 | #' @return A ggplot plot object if \code{type = "improvement"} otherwise an 148 | #' igraph plot. 149 | #' @export 150 | plot.diffnet <- function(x, type = "network", ...) { 151 | # Check inputs 152 | type <- match.arg(type, c("network", "improvement", "p-value")) 153 | 154 | if(type == "network") { 155 | if (requireNamespace("igraph", quietly = TRUE)) { 156 | # Plot network 157 | g <- igraph::graph_from_data_frame(d = x[, 1:2]) 158 | igraph::plot.igraph(g, edge.arrow.size=.3, vertex.color = "grey70") 159 | } else { 160 | stop("In order to use this functionality the `igraph` package needs to be installed. Run `install.packages('igraph')` and retry.") 161 | } 162 | } 163 | else if(type == "improvement") { 164 | ggplot(x) + 165 | geom_line(aes_string(x=c(1:nrow(x)), y = "improvement"), 166 | color = "grey80", size = 0.5) + 167 | geom_point(aes_string(x=c(1:nrow(x)), y = "improvement"), 168 | size = 0.5) + 169 | xlab("Edge Number") + ylab("Improvement") + 170 | PLOT_THEME_() 171 | } else { 172 | ggplot(x) + 173 | geom_line(aes_string(x=c(1:nrow(x)), y = "p_value"), 174 | color = "grey80", size = 0.5) + 175 | geom_point(aes_string(x=c(1:nrow(x)), y = "p_value"), 176 | size = 0.5) + 177 | xlab("Edge Number") + ylab("P-Value") + 178 | PLOT_THEME_() 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /R/netinf.R: -------------------------------------------------------------------------------- 1 | #' Infer latent diffusion network 2 | #' 3 | #' Infer a network of diffusion ties from a set of cascades. Each cascade 4 | #' is defined by pairs of node ids and infection times. 5 | #' 6 | #' The algorithm is describe in detail in Gomez-Rodriguez et al. (2010). 7 | #' Additional information can be found on the 8 | #' netinf website (\url{http://snap.stanford.edu/netinf/}). 9 | #' 10 | #' \itemize{ 11 | #' \item Exponential distribution: \code{trans_mod = "exponential"}, 12 | #' \code{params = c(lambda)}. 13 | #' Parametrization: \eqn{\lambda e^{-\lambda x}}. 14 | #' \item Rayleigh distribution: \code{trans_mod = "rayleigh"}, 15 | #' \code{params = c(alpha)}. 16 | #' Parametrization: \eqn{\frac{x}{\alpha^2} \frac{e^{-x^2}}{2\alpha^2}}. 17 | #' \item Log-normal distribution: \code{trans_mod = "log-normal"}, 18 | #' \code{params = c(mu, sigma)}. 19 | #' Parametrization: \eqn{\frac{1}{x\sigma\sqrt{2\pi}}e^{-\frac{(ln x - \mu)^2}{2\sigma^2}}}. 20 | #' } 21 | #' 22 | #' If higher performance is required and for very large data sets, a faster pure C++ 23 | #' implementation is available in the Stanford Network Analysis Project (SNAP). 24 | #' The software can be downloaded at \url{http://snap.stanford.edu/netinf/}. 25 | #' 26 | #' @import checkmate 27 | #' @import assertthat 28 | #' 29 | #' @param cascades an object of class cascade containing node and cascade 30 | #' information. See \code{\link{as_cascade_long}} and 31 | #' \code{\link{as_cascade_wide}} for details. 32 | #' @param trans_mod character, indicating the choice of model: 33 | #' \code{"exponential"}, \code{"rayleigh"} or \code{"log-normal"}. 34 | #' @param params numeric, Parameters for diffusion model. If left unspecified 35 | #' reasonable parameters are inferred from the data. See details for how to 36 | #' specify parameters for the different distributions. 37 | #' @param n_edges integer, number of edges to infer. Leave unspecified if using 38 | #' \code{p_value_cutoff}. 39 | #' @param p_value_cutoff numeric, in the interval (0, 1). If 40 | #' specified, edges are inferred in each iteration until the Vuong test for 41 | #' edge addition reaches the p-value cutoff or when the maximum 42 | #' possible number of edges is reached. Leave unspecified if using 43 | #' \code{n_edges} to explicitly specify number of edges to infer. 44 | #' @param quiet logical, Should output on progress by suppressed. 45 | #' @param trees logical, Should the inferred cascade trees be returned. Note, 46 | #' that this will lead to a different the structure of the function output. 47 | #' See section Value for details. 48 | #' 49 | #' @return Returns the inferred diffusion network as an edgelist in an object of 50 | #' class \code{diffnet} and \code{\link[base]{data.frame}}. The first 51 | #' column contains the sender, the second column the receiver node. The 52 | #' third column contains the improvement in fit from adding the edge that is 53 | #' represented by the row. The output additionally has the following 54 | #' attributes: 55 | #' \itemize{ 56 | #' \item \code{"diffusion_model"}: The diffusion model used to infer the 57 | #' diffusion network. 58 | #' \item \code{"diffusion_model_parameters"}: The parameters for the 59 | #' model that have been inferred by the approximate profile MLE 60 | #' procedure. 61 | #' } 62 | #' If the argument \code{trees} is set to \code{TRUE}, the output is a list 63 | #' with the first element being the \code{data.frame} described above, and 64 | #' the second element being the trees in edge-list form in a single 65 | #' \code{data.frame}. 66 | #' 67 | #' @references 68 | #' M. Gomez-Rodriguez, J. Leskovec, A. Krause. Inferring Networks of Diffusion 69 | #' and Influence.The 16th ACM SIGKDD Conference on Knowledge Discovery and 70 | #' Data Mining (KDD), 2010. 71 | #' 72 | #' @examples 73 | #' 74 | #' # Data already in cascades format: 75 | #' data(cascades) 76 | #' out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1) 77 | #' 78 | #' # Starting with a dataframe 79 | #' df <- simulate_rnd_cascades(10, n_nodes = 20) 80 | #' cascades2 <- as_cascade_long(df, node_names = unique(df$node_name)) 81 | #' out <- netinf(cascades2, trans_mod = "exponential", n_edges = 5, params = 1) 82 | #' 83 | #' @export 84 | netinf <- function(cascades, trans_mod = "exponential", n_edges = NULL, 85 | p_value_cutoff = NULL, params = NULL, quiet = FALSE, 86 | trees = FALSE) { 87 | 88 | # Check inputs 89 | assert_that(class(cascades)[1] == "cascade") 90 | qassert(trans_mod, "S1") 91 | if(is.null(n_edges) & is.null(p_value_cutoff)) { 92 | stop('Please specify either `n_edges` or `p_value_cutoff`.') 93 | } 94 | if(!is.null(n_edges) & !is.null(p_value_cutoff)) { 95 | stop('Please only specify either `n_edges` or `p_value_cutoff`.') 96 | } 97 | if(!is.null(n_edges)) { 98 | qassert(n_edges, "X1[1,)") 99 | auto_edges <- FALSE 100 | cutoff <- 0 # Not used 101 | } else { 102 | qassert(p_value_cutoff, "R1(0,1]") 103 | auto_edges <- TRUE 104 | cutoff <- p_value_cutoff 105 | n_edges <- 0 # Not used since n_edges inferred form cutoff 106 | } 107 | 108 | model <- match.arg(trans_mod, c("exponential", "rayleigh", 'log-normal')) 109 | 110 | if(!is.null(params)) { 111 | if(model == "exponential" | model == "rayleigh") { 112 | qassert(params, "N1") 113 | } else if(model == "log-normal") { 114 | qassert(params[1], "N1") 115 | qassert(params[2], "N1(0,)") 116 | } 117 | } 118 | 119 | # Assign integer node ids 120 | # Note that the ids start at 0 (c++ is 0 indexed) 121 | node_ids <- c(0:(length(cascades$node_names) - 1)) 122 | names(node_ids) <- cascades$node_names 123 | 124 | # Transform node ids in cascades to integer ids 125 | cascade_nodes <- lapply(cascades$cascade_nodes, function(x) node_ids[x]) 126 | 127 | # Initialize parameters 128 | if(is.null(params)) { 129 | max_times <- do.call(c, lapply(cascades$cascade_times, 130 | function(x) x[-1] - x[1])) 131 | min_times <- do.call(c, lapply(cascades$cascade_times, 132 | function(x) x[-1] - x[-length(x)])) 133 | if(model == "exponential") { 134 | lambda_min <- 1 / mean(max_times, na.rm = T) 135 | lambda_max <- 1 / mean(min_times, na.rm = T) 136 | params <- mean(c(lambda_max, lambda_min)) 137 | } else if(model == "rayleigh") { 138 | N <- length(max_times) 139 | sh_max <- sqrt(sum(max_times^2) / 2 * N) 140 | sh_min <- sqrt(sum(min_times^2) / 2 * N) 141 | adjustment <- exp(lgamma(N) + log(sqrt(N))) / exp(lgamma(N + 1 / 2)) 142 | params <- mean(c(sh_max * adjustment, sh_min * adjustment)) 143 | } else if(model == "log-normal") { 144 | mean_max <- mean(log(max_times[max_times != 0])) 145 | mean_min <- mean(log(min_times[min_times != 0])) 146 | sigma_max <- sqrt(stats::var(log(max_times[max_times != 0]))) 147 | sigma_min <- sqrt(stats::var(log(min_times[min_times != 0]))) 148 | params <- c(mean(mean_max, mean_min), mean(sigma_max, sigma_min)) 149 | } 150 | qassert(params, "R") 151 | if(!quiet) cat('Initialized parameters with: ', params, '\n') 152 | } 153 | 154 | # Run netinf 155 | netinf_out <- netinf_(cascade_nodes = cascade_nodes, 156 | cascade_times = cascades$cascade_times, 157 | model = model, params = params, n_edges = n_edges, 158 | quiet = quiet, auto_edges = auto_edges, 159 | cutoff = cutoff) 160 | 161 | 162 | network <- as.data.frame(cbind(do.call(rbind, netinf_out[[1]]), 163 | netinf_out[[2]]), stringsAsFactors = FALSE) 164 | 165 | 166 | ## Replace integer node_ids with node_names 167 | ### In the edgelist 168 | network[, 1] <- cascades$node_names[(network[, 1] + 1)] 169 | network[, 2] <- cascades$node_names[(network[, 2] + 1)] 170 | colnames(network) <- c("origin_node", "destination_node", "improvement") 171 | network$p_value <- netinf_out[[4]] 172 | class(network) <- c("diffnet", "data.frame") 173 | attr(network, "diffusion_model") = model 174 | attr(network, "diffusion_model_parameters") = params 175 | 176 | if(trees) { 177 | tree_dfs <- lapply(1:length(netinf_out[[3]]), function(i) { 178 | x <- netinf_out[[3]][[i]] 179 | out <- as.data.frame(cbind(x[[1]], x[[2]], rep(i, length(x[[1]]))))} 180 | ) 181 | trees_df <- do.call(rbind, tree_dfs) 182 | 183 | # Replace int node ids with node_names 184 | trees_df$child <- do.call(c, cascades$cascade_nodes) 185 | trees_df <- trees_df[!is.na(trees_df[, 2]), ] 186 | trees_df[, 1] <- cascades$node_names[(trees_df[, 1] + 1)] 187 | casc_names <- names(cascades$cascade_nodes) 188 | trees_df[, 3] <- casc_names[trees_df[, 3]] 189 | colnames(trees_df) <- c("parent", "log_score", "cascade_id", "child") 190 | trees_df <- trees_df[, c(1, 4, 2, 3)] 191 | return(list('network' = network, 'trees' = trees_df)) 192 | } 193 | else return(network) 194 | } 195 | 196 | 197 | #' Is the object of class diffnet? 198 | #' 199 | #' Tests if an object is of class diffnet. The class diffnet is appended to the 200 | #' object returned by \code{\link{netinf}} for dispatch of appropriate plotting 201 | #' methods. 202 | #' 203 | #' @param object the object to be tested. 204 | #' 205 | #' @return \code{TRUE} if object is a diffnet, \code{FALSE} otherwise. 206 | #' 207 | #' @examples 208 | #' 209 | #' data(cascades) 210 | #' result <- netinf(cascades, n_edges = 6, params = 1) 211 | #' is.diffnet(result) 212 | #' @export 213 | is.diffnet <- function(object) { 214 | inherits(object, "diffnet") 215 | } 216 | 217 | 218 | #' Count the number of possible edges in the dataset 219 | #' 220 | #' Across all cascades, count the edges that are possible. An edge from node 221 | #' \code{u} to node \code{v} 222 | #' is only possible if in at least one cascade \code{u} experienced an event 223 | #' before \code{v}. 224 | #' 225 | #' @param cascades Object of class cascade containing the data. 226 | #' 227 | #' @return An integer count. 228 | #' 229 | #' @examples 230 | #' data(cascades) 231 | #' count_possible_edges(cascades) 232 | #' 233 | #' @export 234 | count_possible_edges <- function(cascades) { 235 | # Check inputs 236 | assert_that(is.cascade(cascades)) 237 | 238 | # Assign integer node ids 239 | # Note that the ids start at 0 (c++ is 0 indexed) 240 | node_ids <- c(0:(length(cascades$node_names) - 1)) 241 | names(node_ids) <- cascades$node_names 242 | 243 | # Transform node ids in cascades to integer ids 244 | cascade_nodes <- lapply(cascades$cascade_nodes, function(x) node_ids[x]) 245 | 246 | n <- count_possible_edges_(cascade_nodes, cascades$cascade_times) 247 | return(n) 248 | } -------------------------------------------------------------------------------- /src/netinf.cpp: -------------------------------------------------------------------------------- 1 | // [[Rcpp::depends(RcppProgress)]] 2 | #include 3 | #include 4 | #include 5 | #include "netinf_utilities.h" 6 | #include "vuong_test.h" 7 | #include "possible_edges.h" 8 | #include "spanning_tree.h" 9 | #include "netinf.h" 10 | #include "distributions.h" 11 | 12 | using namespace Rcpp; 13 | 14 | typedef edge_map::iterator m_iter; 15 | typedef edge_map::reverse_iterator rm_iter; 16 | 17 | // [[Rcpp::export]] 18 | List netinf_(List &cascade_nodes, List &cascade_times, int &n_edges, 19 | std::string &model, NumericVector ¶ms, 20 | bool quiet, bool &auto_edges, double &cutoff) { 21 | 22 | // Prepare the trees of each cascade (find the optimal spanning tree and 23 | // store parents for each node and respective scores) 24 | if(!quiet) Rcout << "Initializing trees...\n"; 25 | List trees_data = initialize_trees(cascade_nodes, cascade_times, params, 26 | model); 27 | List trees = trees_data[0]; 28 | NumericVector tree_scores = trees_data[1]; 29 | 30 | // Get edges that are possible given the cascade data 31 | edge_map possible_edges = get_possible_edges_(cascade_nodes, cascade_times, 32 | quiet); 33 | 34 | // Output containers 35 | int n_p_edges = possible_edges.size(); 36 | if(auto_edges) n_edges = n_p_edges; 37 | List edges; 38 | NumericVector scores; 39 | NumericVector p_values; 40 | 41 | if(n_edges > n_p_edges) { 42 | std::string msg = "Argument `n_edges` exceeds the maximal number of possible edges (which is " + 43 | std::to_string(n_p_edges) + ").\n"; 44 | throw std::invalid_argument(msg); 45 | } 46 | 47 | if(!quiet) { 48 | if(auto_edges) Rcout << "Inferring edges using p-value cutoff...\n"; 49 | else Rcout << "Inferring " << n_edges << " edges...\n"; 50 | } 51 | 52 | // Set up for timing first iteration and progress bar (if not auto edges) 53 | bool show_progress = true; 54 | if(quiet) show_progress = false; 55 | if(auto_edges) show_progress = false; 56 | Progress p(n_edges, show_progress); 57 | 58 | int e; 59 | int check_interval = (n_p_edges / 10) + 1; 60 | id_array previous_best_edge = {{-1, -1}}; 61 | NumericVector improvements(n_p_edges); 62 | 63 | for(e = 0; e < n_edges; e++) { 64 | 65 | m_iter start_iter = possible_edges.begin(); 66 | id_array end_id = possible_edges.rbegin()->first; 67 | if(e > 0) { 68 | // Find the first edge in possible_edges that has the same child as 69 | // previous_best_edge by iterating back from previous_best_edge 70 | m_iter it_best = possible_edges.find(previous_best_edge); 71 | int current_child = previous_best_edge[0]; 72 | id_array last_key = it_best->first; 73 | for(m_iter rit = it_best; rit->first[0] == current_child; rit--) { 74 | last_key = rit->first; 75 | if(rit == possible_edges.begin()) break; 76 | } 77 | 78 | // And store the iterator as start point for edge inference 79 | start_iter = possible_edges.find(last_key); 80 | 81 | // Find the last edge in possible_edges with the same child as 82 | // previous_best_edges 83 | for(m_iter rit = it_best; rit != possible_edges.end(); rit++) { 84 | if(rit->first[0] != current_child) break; 85 | last_key = rit->first; 86 | } 87 | // And store the iterator as start point for edge inference 88 | end_id = possible_edges.find(last_key)->first; 89 | } else { 90 | // In the first iteration we have to check every edge 91 | } 92 | 93 | int i = 0; 94 | for (m_iter x=start_iter; x!=possible_edges.end(); x++) { 95 | 96 | // Skip edge inferred in last iteration 97 | if(x->first == previous_best_edge) continue; 98 | 99 | //potential parent 100 | int parent = x->first[1]; 101 | // infected node 102 | int child = x->first[0]; 103 | 104 | //find replacements for u->v edge 105 | List edge_replacements = tree_replacement(parent, child, 106 | possible_edges, 107 | cascade_times, 108 | cascade_nodes, trees, 109 | model, params); 110 | // Store the updated potential improvement value for this edge 111 | x->second.second = edge_replacements[0]; 112 | 113 | // Check for user interrupt and update progress bar 114 | if((i % check_interval) == 0) { 115 | checkUserInterrupt(); 116 | } 117 | i += 1; 118 | 119 | // Stop when last edge to check is reached 120 | if(x->first == end_id) break; 121 | } 122 | 123 | // Erase the previous best edge from possible edges 124 | possible_edges.erase(previous_best_edge); 125 | 126 | // Check all improvements to find the best edge 127 | double max_improvement = 0; 128 | id_array best_edge; 129 | for(m_iter x = possible_edges.begin(); x != possible_edges.end(); x++) { 130 | if(x->second.second >= max_improvement) { 131 | max_improvement = x->second.second; 132 | best_edge = x->first; 133 | } 134 | } 135 | // Re calculate the replacement data for the best edge 136 | List best_edge_replacement_data = tree_replacement(best_edge[1], 137 | best_edge[0], 138 | possible_edges, 139 | cascade_times, 140 | cascade_nodes, trees, 141 | model, params); 142 | 143 | // Store the best results 144 | // Put edge in order parent->child for backwards compatibility 145 | id_array best_edge_out = {{best_edge[1], best_edge[0]}}; 146 | edges.push_back(best_edge_out); 147 | scores.push_back(max_improvement); 148 | 149 | // Update the trees with the new edge 150 | NumericVector old_tree_scores = copy_vector(tree_scores); 151 | update_trees(trees, tree_scores, best_edge_replacement_data, 152 | cascade_nodes, best_edge); 153 | 154 | // Test if the edge improves fit 155 | double p_value = vuong_test(old_tree_scores, tree_scores); 156 | p_values.push_back(p_value); 157 | 158 | // Store the best edge for this iteration to inform what to iterate over 159 | // for the next edge 160 | previous_best_edge = best_edge; 161 | 162 | if(!auto_edges & !quiet) p.increment(); 163 | 164 | if(!quiet & auto_edges){ 165 | Rcout << "\r" << (e+1) << " edges inferred. P-value: " << 166 | p_value << std::flush; 167 | } 168 | 169 | if(auto_edges & (p_value >= cutoff)) { 170 | if(!quiet) Rcout << "\nReached p-value cutoff. Stopping.\n"; 171 | break; 172 | } 173 | } 174 | 175 | // Write out message if maximum number of edges has been reach below cutoff 176 | if(auto_edges & (e == n_edges)) { 177 | if(!quiet) Rcout << "Reached maximum number of possible edges" << 178 | " before p-value cutoff.\n"; 179 | } 180 | List out = List::create(edges, scores, trees, p_values); 181 | return out; 182 | } 183 | 184 | List tree_replacement(int &parent, int &child, edge_map &possible_edges, 185 | List &cascade_times, List &cascade_nodes, 186 | List &trees, std::string &model, NumericVector ¶ms) { 187 | 188 | // Get the cascades the edge is possible in: 189 | std::array pair_id = {{child, parent}}; 190 | std::vector cascades = possible_edges.find(pair_id)->second.first; 191 | int n_possible_cascades = cascades.size(); 192 | 193 | // Initialize output containers 194 | IntegerVector cascades_with_replacement(n_possible_cascades, -1); 195 | NumericVector replacement_scores(n_possible_cascades, NA_REAL); 196 | 197 | // Total improvement achieved by this edge across all trees 198 | double improvement = 0; 199 | for(int c = 0; c < cascades.size(); c++) { 200 | 201 | // Get pointers to the data of current cascade 202 | int this_cascade = cascades[c]; 203 | IntegerVector this_cascade_nodes = cascade_nodes[this_cascade]; 204 | NumericVector this_cascade_times = cascade_times[this_cascade]; 205 | 206 | // Get the event time for u and v in current cascade 207 | int idx_parent = get_index(this_cascade_nodes, parent); 208 | int idx_child = get_index(this_cascade_nodes, child); 209 | double timing_parent = this_cascade_times[idx_parent]; 210 | double timing_child = this_cascade_times[idx_child]; 211 | 212 | // extract score associated with the current parent of child 213 | List this_tree = trees[this_cascade]; 214 | NumericVector scores = this_tree[1]; 215 | double current_score = scores[idx_child]; 216 | 217 | // what would the score be with the propspective parent (u) 218 | double replacement_score = edge_score(timing_parent, timing_child, 219 | model, params, true); 220 | 221 | // If the edge has a higher score add it to overall improvement and 222 | // store the cascade the improvement occured in (and the new score) 223 | if(replacement_score > current_score) { 224 | improvement += replacement_score - current_score; 225 | cascades_with_replacement[c] = this_cascade; 226 | replacement_scores[c] = replacement_score; 227 | } 228 | } 229 | 230 | 231 | List out = List::create(improvement, cascades_with_replacement, 232 | replacement_scores); 233 | return out; 234 | } 235 | 236 | void update_trees(List &trees, NumericVector &tree_scores, 237 | List &replacement_data, List &cascade_nodes, 238 | id_array best_edge) { 239 | 240 | IntegerVector updated_cascades = replacement_data[1]; 241 | NumericVector replacement_scores = replacement_data[2]; 242 | NumericVector old_tree_scores = copy_vector(tree_scores); 243 | 244 | // Get u and v of best edge 245 | int parent = best_edge[1]; 246 | int child = best_edge[0]; 247 | for(int i = 0; i < updated_cascades.size(); i++) { 248 | int this_cascade = updated_cascades[i]; 249 | if(this_cascade < 0) continue; 250 | IntegerVector this_cascade_nodes = cascade_nodes[this_cascade]; 251 | int idx_child = get_index(this_cascade_nodes, child); 252 | List casc_tree = trees[this_cascade]; 253 | 254 | IntegerVector this_parents = casc_tree[0]; 255 | NumericVector this_scores = casc_tree[1]; 256 | 257 | //update parent id for v 258 | this_parents[idx_child] = parent; 259 | // update branch score 260 | this_scores[idx_child] = replacement_scores[i]; 261 | tree_scores[this_cascade] = sum_vector(this_scores); 262 | } 263 | } 264 | -------------------------------------------------------------------------------- /R/simulate_cascades.R: -------------------------------------------------------------------------------- 1 | #' Simulate a set of random cascades 2 | #' 3 | #' Simulate random cascades, for testing and demonstration purposes. No actual 4 | #' diffusion model is underlying these cascades. 5 | #' 6 | #' @importFrom stats runif 7 | #' 8 | #' @param n_cascades Number of cascades to generate. 9 | #' @param n_nodes Number of nodes in the system. 10 | #' 11 | #' @return A data frame containing (in order of columns) node ids, 12 | #' event time and cascade identifier. 13 | #' 14 | #' @examples 15 | #' 16 | #' df <- simulate_rnd_cascades(10, n_nodes = 20) 17 | #' head(df) 18 | #' 19 | #' @export 20 | simulate_rnd_cascades <- function(n_cascades, n_nodes) { 21 | qassert(n_cascades, "X1[1,)") 22 | 23 | make_cascade_ <- function(cid) { 24 | n <- runif(1, 1, n_nodes) 25 | ids <- as.character(sample(1:n_nodes, n, replace = FALSE)) 26 | times <- sort(runif(n, 0, 30), decreasing = TRUE) 27 | return(data.frame(ids, times, rep(cid, n), stringsAsFactors = FALSE)) 28 | } 29 | 30 | cascades <- do.call(rbind, lapply(sample(c(1:n_cascades), n_cascades, 31 | replace = FALSE), make_cascade_)) 32 | colnames(cascades) <- c("node_name", "event_time", "cascade_id") 33 | rownames(cascades) <- as.character(c(1:nrow(cascades))) 34 | return(cascades) 35 | } 36 | 37 | 38 | #' Simulate cascades from a diffusion network 39 | #' 40 | #' Simulate diffusion cascades based on the generative model underlying netinf 41 | #' and a diffusion network. 42 | #' 43 | #' @import assertthat 44 | #' 45 | #' @param diffnet object of class \code{diffnet}. 46 | #' @param nsim integer, number of cascades to simulate. 47 | #' @param max_time numeric, the maximum time after which observations are 48 | #' censored 49 | #' @param start_probabilities a vector of probabilities for each node in diffnet, 50 | #' to be the node with the first event. If \code{NULL} a node is drawn from 51 | #' a uniform distribution over all nodes. 52 | #' @param partial_cascade object of type cascade, containing one partial 53 | #' cascades for which further development should be simulated. 54 | #' @param params numeric, (optional) parameters for diffusion time distribution. 55 | #' See the details section of \code{\link{netinf}} for specification details. 56 | #' Only use this argument if parameters different from those contained in the 57 | #' \code{diffnet} object should be used or the network is not an object of 58 | #' class \code{diffnet}. 59 | #' @param model character, diffusion model to use. One of \code{c("exponential", 60 | #' "rayleigh", "log-normal")}. Only use this argument if parameters different 61 | #' from those contained in the \code{diffnet} object should be used or the 62 | #' network is not an object of class \code{diffnet}. 63 | #' @param nodes vector of node ids if different from nodes included in 64 | #' \code{diffnet} 65 | #' 66 | #' @return A data frame with three columns. Containing 1) The names of 67 | #' the nodes (\code{"node_name"}) that experience an event in each cascade, 68 | #' 2) the event time (\code{"event_time"}) of the corresponding node, 69 | #' 3) the cascade identifier \code{"cascade_id"}. 70 | #' 71 | #' @examples 72 | #' 73 | #' data(cascades) 74 | #' out <- netinf(cascades, trans_mod = "exponential", n_edges = 5, params = 1) 75 | #' simulated_cascades <- simulate_cascades(out, nsim = 10) 76 | #' 77 | #' # Simulation from partial cascade 78 | #' 79 | #' @export 80 | simulate_cascades <- function(diffnet, nsim = 1, max_time = Inf, 81 | start_probabilities = NULL, 82 | partial_cascade = NULL, params = NULL, 83 | model = NULL, nodes = NULL) { 84 | # Check inputs 85 | assert_that(is.diffnet(diffnet)) 86 | assert_that(nsim >= 1) 87 | 88 | if(is.null(model)) { 89 | model <- attr(diffnet, "diffusion_model") 90 | } 91 | if(is.null(params)) { 92 | params <- attr(diffnet, "diffusion_model_parameters") 93 | } 94 | model <- match.arg(model, c("exponential", "rayleigh", "log-normal")) 95 | if(model == "rayleigh") { 96 | stop("Rayleigh distribution is not implemented yet. Please choose the exponential or log-normal diffusion model.") 97 | } 98 | if(is.null(nodes)) { 99 | nodes <- unique(c(diffnet$origin_node, diffnet$destination_node)) 100 | if(any(!is.element(partial_cascade$cascade_nodes[[1]], nodes))) { 101 | # TODO: This could be a warning and nodes that are not in the network 102 | # could be dropped from the partial cascade 103 | stop("There are nodes in the partial cascade that are not part of the diffusion network. Dropping these nodes. If these nodes should be included for potential out-of-network diffusion, please provide them via the `nodes` argument.") 104 | #overlap = partial_cascade$cascade_nodes[[1]][ 105 | # is.element(partial_cascade$cascade_nodes[[1]], nodes)] 106 | #partial_cascade = subset_cascade(partial_cascade, overlap) 107 | } 108 | } 109 | n_nodes <- length(nodes) 110 | 111 | if(!is.null(start_probabilities) & !is.null(partial_cascade)) { 112 | stop("Start probabilities are not allowed with partial cascades") 113 | } 114 | 115 | if(is.null(start_probabilities)) { 116 | start_probabilities <- rep(1/n_nodes, n_nodes) 117 | } 118 | # Check start probabilities 119 | qassert(start_probabilities, paste0('N', n_nodes, '[0,1]')) 120 | 121 | # Check partial cascade input 122 | if(!is.null(partial_cascade)) { 123 | assert_that(is.cascade(partial_cascade)) 124 | assert_that(length(partial_cascade$cascade_nodes) == 1) 125 | assert_that(length(partial_cascade$cascade_times) == 1) 126 | see_if(all(is.element(partial_cascade$cascade_nodes[[1]], nodes))) 127 | } 128 | 129 | ## Create adjacency matrix from edgelist (ordered as in nodes, row -> sender) 130 | X_ <- matrix(0, ncol = n_nodes, nrow = n_nodes) 131 | for(k in 1:nrow(diffnet)) { 132 | i <- which(nodes == diffnet[k, 1]) 133 | j <- which(nodes == diffnet[k, 2]) 134 | X_[i, j] <- 1 135 | } 136 | 137 | sim_out <- lapply(X = 1:nsim, FUN = simulate_cascade_, n_nodes = n_nodes, 138 | params = params, max_time = max_time, 139 | model = model, nodes = nodes, X_ = X_, 140 | partial_cascade = partial_cascade, 141 | start_probabilities = start_probabilities) 142 | out <- do.call(rbind, sim_out) 143 | rownames(out) <- NULL 144 | return(out) 145 | } 146 | 147 | 148 | rltruncexp <- function(n, rate, ltrunc) { 149 | stats::qexp(stats::runif(n, min = stats::pexp(ltrunc, rate = rate), 150 | max = 1), rate = rate) 151 | } 152 | 153 | rltrunclnorm <- function(n, meanlog, sdlog, ltrunc) { 154 | stats::qlnorm(stats::runif(n, min = stats::plnorm(ltrunc, meanlog = meanlog, 155 | sdlog = sdlog)), 156 | meanlog = meanlog, sdlog = sdlog) 157 | } 158 | 159 | # This function generates a matrix of relative diffusion times (rows senders, 160 | # columns receivers) where all diffusion times sent by the nodes involved in the 161 | # partial cascade are left truncated to be at least long enough to not infect 162 | # any nodes before the absolute left censoring time (the last infection time 163 | # in the partial cascade). 164 | truncated_rel_diff_times <- function(n_nodes, nodes, partial_cascade, model, 165 | params) { 166 | out <- matrix(stats::rexp(n_nodes^2, rate = params), nrow = n_nodes) 167 | rownames(out) <- colnames(out) <- nodes 168 | 169 | # Get the truncation point for each node in the partial cascade 170 | pc_times <- partial_cascade$cascade_times[[1]] 171 | trunc_points <- pc_times[length(pc_times)] - pc_times 172 | 173 | # Truncated draws for nodes in partial cascade 174 | pc_nodes = partial_cascade$cascade_nodes[[1]] 175 | 176 | if(model == 'exponential') { 177 | out[pc_nodes, ] = t(sapply(trunc_points, 178 | function(x) rltruncexp(n_nodes, params, x))) 179 | } else if(model == "log-normal") { 180 | out[pc_nodes, ] = t(sapply(trunc_points, 181 | function(x) rltrunclnorm(n_nodes, params[1], 182 | params[2], x))) 183 | } 184 | return(out) 185 | } 186 | 187 | # Simulate a single cascade from scratch (random first event node) 188 | simulate_cascade_ <- function(i, nodes, n_nodes, params, max_time, model, X_, 189 | partial_cascade, start_probabilities) { 190 | beta = 0.5 191 | epsilon = 10e-9 192 | 193 | # Generate relative diffusion times for all pairs 194 | if(is.null(partial_cascade)) { 195 | if(model == "exponential") { 196 | rel_diff_times <- matrix(stats::rexp(n_nodes^2, rate = params), 197 | nrow = n_nodes) 198 | } else if(model == "log-normal") { 199 | rel_diff_times <- matrix(stats::rlnorm(n_nodes^2, meanlog = params[1], 200 | sdlog = params[2]), 201 | nrow = n_nodes) 202 | } 203 | rownames(rel_diff_times) <- colnames(rel_diff_times) <- nodes 204 | } else { 205 | rel_diff_times <- truncated_rel_diff_times(n_nodes, nodes, 206 | partial_cascade, model, 207 | params) 208 | } 209 | 210 | 211 | # No diffusion of node to itself 212 | diag(rel_diff_times) <- 0 213 | if(!is.null(partial_cascade)) { 214 | # No diffusion to nodes that already had an event in partial_cascade 215 | idx <- which(is.element(nodes, partial_cascade$cascade_nodes[[1]])) 216 | rel_diff_times[, idx] <- 0 217 | } 218 | # Censor at maximum observation time 219 | rel_diff_times[rel_diff_times > max_time] <- 0 220 | 221 | # Create matrix to transform network for out of network diffusion 222 | norm_epsilon <- epsilon / (epsilon + beta) 223 | Y <- matrix(0, ncol = n_nodes, nrow = n_nodes) 224 | # Set colums to 1 with probability norm_ep 225 | Y[, as.logical(stats::rbinom(n_nodes, 1, prob = norm_epsilon))] <- 1 226 | 227 | # Set relative diffusion times between nodes with no corresponding diffusion 228 | # -edge to 0 229 | rel_diff_times <- (X_ - Y)^2 * rel_diff_times 230 | 231 | if(is.null(partial_cascade)) { 232 | start_nodes <- sample(x = nodes, size = 1, prob = start_probabilities) 233 | } else { 234 | start_nodes <- nodes[idx] 235 | } 236 | # Find shortest path from start node to every other (reachable) node 237 | if (requireNamespace("igraph", quietly = TRUE)) { 238 | g <- igraph::graph.adjacency(rel_diff_times, weighted = TRUE, 239 | mode = "directed") 240 | dists <- igraph::distances(g, v = start_nodes, mode = "out") 241 | # reorder 242 | if(!is.null(partial_cascade)) { 243 | dists <- dists[partial_cascade$cascade_nodes[[1]], , drop = FALSE] 244 | } 245 | 246 | } else { 247 | stop("In order to use this functionality the `igraph` package needs to be installed. Run `install.packages('igraph')` and retry.") 248 | } 249 | 250 | if(is.null(partial_cascade)) { 251 | prev_event_times <- 0 252 | } else { 253 | prev_event_times <- partial_cascade$cascade_times[[1]] 254 | } 255 | # Find the earliest event time for each node 256 | ## Add the event time of each node in partial_cascade to shortest path 257 | ## from this node (or add 0 if no partial cascade) 258 | abs_dists <- t(sapply(1:nrow(dists), 259 | function(j) dists[j, ] + prev_event_times[j])) 260 | d <- apply(abs_dists, 2, min) 261 | d <- d[!is.infinite(d)] 262 | 263 | out <- data.frame("node_name" = names(d), 264 | "event_time" = as.numeric(d), 265 | "cascade_id" = i) 266 | return(out) 267 | } 268 | 269 | -------------------------------------------------------------------------------- /vignettes/tutorial_vignette.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NetworkInference Tutorial: Persistent Policy Diffusion Ties" 3 | author: "Fridolin Linder and Bruce Desmarais" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{NetworkInference Tutorial: Persistent Policy Diffusion Ties} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | ``` 18 | 19 | 20 | --- 21 | 22 | # Introduction 23 | 24 | --- 25 | 26 | In this tutorial we go through some of the functionality of the `NetworkInference` package using the example application from Desmarais et al. (2015) and extending it with the recently released [State Policy Innovation and Diffusion](https://doi.org/10.7910/DVN/CVYSR7) (SPID) database. In this paper Desmarais et al. infer a latent network for policy diffusion based on adoption of policies in the US states. 27 | 28 | `netinf` infers the optimal diffusion network from a set of *nodes* (in this case US-states) and a number of so called *cascades* (here a cascade corresponds to a policy that is adopted by at least two states). A cascade is a series of events occurring at 29 | a specified time. 30 | 31 | ## Installing the package 32 | 33 | Installing `NetworkInference`: 34 | ```{r, eval=FALSE} 35 | install.packages('NetworkInference') 36 | ``` 37 | 38 | 39 | Some other packages required for this tutorial: 40 | ```{r, eval=FALSE} 41 | install.packages(c('dplyr', 'igraph')) 42 | ``` 43 | 44 | ## Exploring SPID Data 45 | 46 | The policy adoption data is available in the package: 47 | 48 | ```{r, message=FALSE} 49 | library(NetworkInference) 50 | # Load the `policies` dataset (?policies for details). 51 | data('policies') 52 | ``` 53 | 54 | This loads two `data.frame`-objects. `policies` contains the adoption events and `policies_metadata` contains additional information on each policy. 55 | 56 | ```{r, eval=FALSE} 57 | head(policies) 58 | ``` 59 | 60 | ```{r, results="asis", echo=FALSE} 61 | pander::pandoc.table(head(policies)) 62 | ``` 63 | 64 | The first column (`statenam`) gives the state that adopted the `policy` in year `adopt_year`. Let's take a closer look at the data. 65 | 66 | Number of policies: 67 | ```{r} 68 | length(unique(policies$policy)) 69 | ``` 70 | Number of adoption events: 71 | ```{r} 72 | nrow(policies) 73 | ``` 74 | Some example policies: 75 | ```{r} 76 | unique(policies$policy)[1:5] 77 | ``` 78 | 79 | Not all the policy abbreviations are understandable. So let's check the metadata for more information: 80 | ```{r, eval=FALSE} 81 | library(dplyr) 82 | filter(policies_metadata, policy %in% unique(policy)[100:104]) %>% 83 | select(-source) 84 | ``` 85 | ```{r, results="asis", echo=FALSE, message=FALSE} 86 | library(dplyr) 87 | pander::pandoc.table(filter(policies_metadata, policy %in% unique(policy)[100:104]) %>% 88 | select(-source)) 89 | ``` 90 | 91 | 92 | ## Preparing the Data for NetworkInference 93 | 94 | Most functionality of the `NetworkInference` package is based on the `cascades` 95 | data format. So before starting with the analysis we have to transform our data 96 | to such an object. 97 | 98 | ```{r} 99 | policy_cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 100 | event_time = 'adopt_year', 101 | cascade_id = 'policy') 102 | ``` 103 | 104 | In this case we used the function `as_cascade_long`. If your data is in wide format you can convert it using the function `as_cascade_wide`. 105 | 106 | The `cascade` class contains the same data as the `policies` `data.frame`, just in a different format. You don't need to understand how the object is constructed but let's take a look for clarity: 107 | 108 | ```{r} 109 | class(policy_cascades) 110 | length(policy_cascades) 111 | names(policy_cascades) 112 | ``` 113 | 114 | The `cascade` class is basically a list containing three elements: 115 | 116 | ```{r} 117 | policy_cascades$cascade_nodes[2:3] 118 | ``` 119 | 120 | ```{r} 121 | policy_cascades$cascade_times[2:3] 122 | ``` 123 | 124 | There are a few convenience functions to manipulate the cascades (but you can also manipulate the data before converting it to the `cascade` format). 125 | 126 | ### Subsetting by Cascade 127 | ```{r} 128 | selected_policies <- subset_cascade(cascade = policy_cascades, 129 | selection = c('clinic_access', 'cogrowman')) 130 | selected_policies[1:2] 131 | ``` 132 | 133 | ## Subsetting in Time 134 | 135 | ```{r} 136 | time_constrained <- subset_cascade_time(cascade = selected_policies, 137 | start_time = 1990, end_time = 2000) 138 | time_constrained[1:2] 139 | ``` 140 | 141 | ## Removing Nodes (States) 142 | 143 | ```{r} 144 | less_nodes <- drop_nodes(cascades = time_constrained, 145 | nodes = c('Maryland', 'Washington')) 146 | less_nodes[1:2] 147 | ``` 148 | 149 | ### Visually Inspecting Cascades 150 | 151 | It's always good practice to visually inspect the data before working with it. 152 | The `NetworkInference` package provides functionality to visualize the cascade 153 | data. 154 | 155 | The function `summary.cascades()` provides quick summary statistics on the 156 | cascade data: 157 | 158 | ```{r} 159 | summary(policy_cascades) 160 | ``` 161 | 162 | The first four lines provide the number of cascades, the number of nodes in the 163 | system, the number of nodes involved in cascades (there might be states that we 164 | don't have diffusion data on, but we still want them represented in the dataset) 165 | and the possible number of edges in a potential diffusion network (a diffusion edge 166 | between nodes `u` and `v` only makes sense if there is at least one cascade in 167 | which `u` experiences an event before `v`). In this example there are 187 168 | policies and 50 states. Each state is involved in at least one policy cascade 169 | and a fully connected diffusion network would have 2450 edges. 170 | 171 | It also provides summary statistics on the distribution of the cascade lengths 172 | (number of nodes involved in each cascade) and the number of ties in the 173 | cascades (two nodes experiencing the same event at the same time). For our 174 | example, we can see that the 'smallest' policy was adopted by 10 states and the 175 | 'largest' by all 50 states. From the tie summaries we see that there is at least 176 | one policy that was adopted by 45 states in the same year. 177 | 178 | The `plot()` method allows to plot cascades with varying degrees of detail. The 179 | argument `label_nodes` (`TRUE/FALSE`) provides node labels which require more space 180 | but provide more detail. The argument `selection` allows to pick a subset of 181 | cascades to visualize in case there are too many to plot. If `label_nodes` is set 182 | to `FALSE` each event is depicted by a dot, which allows to visualize more cascades 183 | simultaneously. 184 | 185 | Let's first look at the visualization with labels. Here we plot two cascades, 186 | selected by their name: 187 | ```{r, fig.align='center', fig.width=7, fig.height=4} 188 | selection <- c('guncontrol_assaultweapon_ba', 'guncontrol_licenses_dealer') 189 | plot(policy_cascades, label_nodes = TRUE, selection = selection) 190 | ``` 191 | 192 | We can also plot more cascades with less detail: 193 | ```{r, fig.align='center', fig.width=7, fig.height=4} 194 | selection <- c('waiting', 'threestrikes', 'unionlimits', 'smokeban', 195 | 'paperterror', 'miglab', 'methpre', 'lott', 'lemon', 'idtheft', 196 | 'harass', 'hatecrime', 'equalpay') 197 | plot(policy_cascades, label_nodes = FALSE, selection = selection) 198 | ``` 199 | 200 | This produces a ['violin plot'](https://en.wikipedia.org/wiki/Violin_plot) for each cascade with the single diffusion events overplotted as dots. As we already saw in the previous visualization, the policy data has a lot of ties (i.e. many states adopted a policy in the same year) which is indicated by the areas of higher density in the violin plot. 201 | 202 | 203 | ## Inferring the Latent Diffusion Network 204 | 205 | The `netinf` algorithm is implemented in the `netinf()` function. The `netinf` inferrs edges based on a diffusion model. That is we assume a parametric model for the diffusion times between edges. Currently three different diffusion models are implemented: The exponential distribution, the rayleigh distribution and the log-normal distribution. The model can be chosen with the `trans_mod` argument (default is the exponential distribution). 206 | 207 | ### Classic Netinf 208 | 209 | In the original implementation the number of edges to infer had to be fixed and chosen by the researcher. If you want to run `netinf` in this classic way you can do so by specifiying all parameters and the number of edges: 210 | 211 | ```{r} 212 | results <- netinf(policy_cascades, trans_mod = "exponential", n_edges = 100, 213 | params = 0.5, quiet = TRUE) 214 | ``` 215 | 216 | The exponential model has only one parameter (lambda or the rate). If there are more parameters the details section in the documentation of the `netinf` function (`?netinf`) has more detail on how to specify parameters and on the specific parametrization used by the implementation. 217 | 218 | `n_edges` specifies how many edges should be inferred. See @gomez2010inferring and @desmarais2015persistent for guidance on choosing this parameter if running netinf in classic mode. If the number of edges is specified manually, it has to be lower than the maximum number of possible edges. An edge `u->v` is only possible if in at least one cascade `u` experiences an event *before* `v`. This means, that the maximum number of edges depends on the data. The function `count_possible_edges()` allows us to compute the maximum number of edges in advance: 219 | 220 | ```{r} 221 | npe <- count_possible_edges(policy_cascades) 222 | npe 223 | ``` 224 | 225 | 226 | ### Automatic Parameter Selection 227 | 228 | With version 1.2.0 `netinf` can be run without manually specifying the number of edges or the parameters of the diffusion model. 229 | 230 | #### Selecting the number of edges automatically 231 | 232 | After each iteration of the netinf algorithm, we check if the edge added significant improvement to the network. This is done via a vuong style test. Given the likelihood score for each cascade conditional on the network inferred so far, we penalize the network with one addional edge and test if the increase in likelihood accross all cascades is significant. The user still has to specify a p-value cut-off. If the p-value of an edge is larger than the specified cut-off the algorithm stops inferring more edges. The cut-off is set via the `p_value_cutoff` argument. 233 | 234 | ```{r} 235 | results <- netinf(policy_cascades, trans_mod = "exponential", 236 | p_value_cutoff = 0.1, params = 0.5, quiet = TRUE) 237 | nrow(results) 238 | ``` 239 | 240 | We see that with a fixed lambda of 0.5 and a p-value cut-off of 0.1 the algorithm inferred 872 edges. 241 | 242 | #### Selecting the parameters of the diffusion model 243 | 244 | The diffusion model parameters can be selected automatically. Setting the `params` argument to `NULL` (default value) makes the `netinf` function initialize the parameters automatically. The parameters are initialized at the midpoint between the MLE of the minimum diffusion times and the MLE of the maximum diffusion times, across all cascades. Edges are then inferred until either the p-value cut-off or a manually specified number of edges (`n_edges`) is reached. 245 | 246 | ```{r} 247 | results <- netinf(policy_cascades, trans_mod = "exponential", 248 | p_value_cutoff = 0.1, quiet = TRUE) 249 | nrow(results) 250 | ``` 251 | 252 | ### Netinf output 253 | 254 | Let's take a look at the output of the algorithm. The output is a dataframe containing the inferred latent network in the form of an edgelist: 255 | 256 | ```{r, eval=FALSE, echo=TRUE} 257 | head(results) 258 | ``` 259 | ```{r, results = "asis", echo=FALSE} 260 | pander::pandoc.table(head(results)) 261 | ``` 262 | 263 | Each row corresponds to a directed edge. The first column indicates the origin node, the second the destination node. The third column displays the gain in model fit from each added edge. The last column displays the p-value from the Vuong test of each edge. There is a generic plot method to inspect the results. If more tweaking is required, the results are a dataframe so it should be easy for the more experienced users to make your own plot. With `type = "improvement"` the improvement from each edge can be plotted: 264 | 265 | ```{r, fig.align='center', fig.width=7, fig.height=4} 266 | plot(results, type = "improvement") 267 | ``` 268 | 269 | We can also quickly check the p-value from the Vuong test associated with each edge addition: 270 | 271 | ```{r, fig.align='center', fig.width=7, fig.height=4} 272 | plot(results, type = 'p-value') 273 | ``` 274 | 275 | In order to produce a quick visualization of the resulting diffusion network we can use the plot method again, this time with `type = "network"`. Note that in order to use this functionality the igraph package has to be installed. 276 | 277 | ```{r, fig.width=7, fig.height=5.5} 278 | #install.packages('igraph') 279 | # For this functionality the igraph package has to be installed 280 | # This code is only executed if the package is found: 281 | if(requireNamespace("igraph", quietly = TRUE)) { 282 | plot(results, type = "network") 283 | } 284 | ``` 285 | 286 | If additional tweaking of the plot is desired, the network can be visualized using `igraph` explicitly. We refer you you to the [igraph documentation](https://CRAN.R-project.org/package=igraph) for details on how to customize the plot. 287 | 288 | ```{r, message=FALSE, eval=FALSE} 289 | if(requireNamespace("igraph", quietly = TRUE)) { 290 | library(igraph) 291 | g <- graph_from_data_frame(d = results[, 1:2]) 292 | plot(g, edge.arrow.size=.3, vertex.color = "grey70") 293 | } 294 | ``` -------------------------------------------------------------------------------- /R/cascade.R: -------------------------------------------------------------------------------- 1 | # Functions related to the cascade class 2 | 3 | #' Is the object of class cascade? 4 | #' 5 | #' @param object the object to be tested. 6 | #' 7 | #' @return \code{TRUE} if object is a cascade, \code{FALSE} otherwise. 8 | #' 9 | #' @examples 10 | #' 11 | #' data(cascades) 12 | #' is.cascade(cascades) 13 | #' # > TRUE 14 | #' is.cascade(1) 15 | #' # > FALSE 16 | #' @export 17 | is.cascade <- function(object) { 18 | inherits(object, "cascade") 19 | } 20 | 21 | 22 | #' Transform long data to cascade 23 | #' 24 | #' Create a cascade object from data in long format. 25 | #' 26 | #' Each row of the data describes one event in the cascade. The data must 27 | #' contain at least three columns: 28 | #' \enumerate{ 29 | #' \item Cascade node name: The identifier of the node that experiences the 30 | #' event. 31 | #' \item Event time: The time when the node experiences the event. Note that 32 | #' if the time column is of class date or any other special time class, 33 | #' it will be converted to an integer with `as.numeric()`. 34 | #' \item Cascade id: The identifier of the cascade that the event pertains to. 35 | #' } 36 | #' The default names for these columns are \code{node_name}, \code{event_time} 37 | #' and \code{cascade_id}. If other names are used in the \code{data} object the 38 | #' names have to be specified in the corresponding arguments (see argument 39 | #' documentation) 40 | #' 41 | #' @import checkmate 42 | #' @import assertthat 43 | #' 44 | #' @param data \link{data.frame}, containing the cascade data 45 | #' with column names corresponding to the arguments provided to 46 | #' \code{cascade_node_names}, \code{event_time} and \code{cascade_id}. 47 | #' @param cascade_node_name character, column name of \code{data} that specifies 48 | #' the node names in the cascade. 49 | #' @param event_time character, column name of \code{data} that specifies the 50 | #' event times for each node involved in a cascade. 51 | #' @param cascade_id character, column name of the cascade identifier. 52 | #' @param node_names character, factor or numeric vector containing the names for each node. 53 | #' Optional. If not provided, node names are inferred from the cascade data. 54 | #' 55 | #' @return An object of class \code{cascade}. This is a list containing three 56 | #' (named) elements: 57 | #' \enumerate{ 58 | #' \item \code{"node_names"} A character vector of node names. 59 | #' \item \code{"cascade_nodes"} A list with one character vector per 60 | #' cascade containing the node names in order of the events. 61 | #' \item \code{"cascade_times"} A list with one element per cascade 62 | #' containing the event times for the nodes in \code{"cascade_names"}. 63 | #' } 64 | #' 65 | #' @examples 66 | #' 67 | #' df <- simulate_rnd_cascades(10, n_nodes = 20) 68 | #' cascades <- as_cascade_long(df) 69 | #' is.cascade(cascades) 70 | # 71 | #' @export 72 | as_cascade_long <- function(data, cascade_node_name = "node_name", 73 | event_time = "event_time", 74 | cascade_id = "cascade_id", node_names = NULL) { 75 | # Check all inputs 76 | data <- as.data.frame(data) 77 | if(is.null(node_names)) { 78 | node_names <- as.character(unique(data[, cascade_node_name])) 79 | } 80 | qassert(cascade_node_name, 'S1') 81 | qassert(event_time, 'S1') 82 | qassert(cascade_id, 'S1') 83 | assert_that(is.element(cascade_node_name, colnames(data))) 84 | assert_that(is.element(event_time, colnames(data))) 85 | assert_that(is.element(cascade_id, colnames(data))) 86 | assert_data_frame(data, min.rows = 1, min.cols = 3) 87 | 88 | # Transform the data 89 | ## Transform cascade ids and node names to character to get consistency 90 | ## down the line 91 | data[, cascade_node_name] <- as.character(data[, cascade_node_name]) 92 | data[, cascade_id] <- as.character(data[, cascade_id]) 93 | 94 | ## Transform to cascade data structure 95 | splt <- split(data, f = data[, cascade_id]) 96 | cascade_nodes <- lapply(splt, function(x) x[, cascade_node_name]) 97 | cascade_times <- lapply(splt, function(x) as.numeric(x[, event_time])) 98 | cascade_times <- lapply(cascade_times, as.numeric) 99 | names(cascade_nodes) <- names(splt) 100 | names(cascade_times) <- names(splt) 101 | 102 | # Check if data is consistent 103 | assert_cascade_consistency_(cascade_nodes, cascade_times, node_names) 104 | 105 | out <- list("cascade_nodes" = cascade_nodes, 106 | "cascade_times" = cascade_times, 107 | "node_names" = node_names) 108 | class(out) <- c("cascade", "list") 109 | 110 | out <- order_cascade_(out) 111 | 112 | return(out) 113 | 114 | } 115 | 116 | 117 | #' Transform wide data to cascade 118 | #' 119 | #' Create a cascade object from data in wide format. 120 | #' 121 | #' If data is in wide format, each row corresponds to a node and each column to 122 | #' a cascade. Each cell indicates the event time for a node - cascade 123 | #' combination. If a node did not experience an event for a cascade (the node 124 | #' is censored) the cell entry must be \code{NA}. 125 | #' 126 | #' @import checkmate 127 | #' @import assertthat 128 | #' 129 | #' @param data \link{data.frame} or \link{matrix}, rows corresponding to nodes, 130 | #' columns to cascades. Matrix entries are the event times for each node, 131 | #' cascade pair. Missing values indicate censored observations, that is, 132 | #' nodes that did not have an event). Specify column and row names if 133 | #' cascade and node ids other than integer sequences are desired. Note that, 134 | #' if the time column is of class date or any other special time class, it 135 | #' will be converted to an integer with `as.numeric()`. 136 | #' @param node_names character, factor or numeric vector, containing names for each node. 137 | #' Optional. If not provided, node names are inferred from the provided data. 138 | #' 139 | #' @return An object of class \code{cascade}. This is a list containing three 140 | #' (named) elements: 141 | #' \enumerate{ 142 | #' \item \code{"node_names"} A character vector of node names. 143 | #' \item \code{"cascade_nodes"} A list with one character vector per 144 | #' cascade containing the node names in order of the events. 145 | #' \item \code{"cascade_times"} A list with one element per cascade 146 | #' containing the event times for the nodes in \code{"cascade_names"}. 147 | #' } 148 | # 149 | #' @examples 150 | #' 151 | #' data("policies") 152 | #' cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 153 | #' event_time = 'adopt_year', cascade_id = 'policy') 154 | #' wide_policies = as.matrix(cascades) 155 | #' cascades <- as_cascade_wide(wide_policies) 156 | #' is.cascade(cascades) 157 | #' 158 | #' @export 159 | #' 160 | as_cascade_wide <- function(data, node_names = NULL) { 161 | 162 | # Check all inputs 163 | if(is.null(node_names)) { 164 | # Get node names 165 | if(is.null(rownames(data))) { 166 | msg <- paste("No rownames provided for data matrix. Assigning integer", 167 | "names to nodes.\n") 168 | warning(msg) 169 | node_names <- as.character(c(1:nrow(data))) 170 | } else { 171 | node_names <- rownames(data) 172 | } 173 | } 174 | 175 | assert( 176 | checkClass(data, "data.frame"), 177 | checkClass(data, "matrix") 178 | ) 179 | data <- as.matrix(data) 180 | assert_matrix(data, all.missing = FALSE) 181 | assert_that(length(node_names) == nrow(data)) 182 | 183 | # Transform the data 184 | ## Get cascade ids 185 | if(is.null(colnames(data))) { 186 | msg <- paste("No column names provided for data. Assigning integer names", 187 | "to cascades.\n") 188 | warning(msg) 189 | cascade_ids <- as.character(c(1:ncol(data))) 190 | } else { 191 | cascade_ids <- colnames(data) 192 | } 193 | 194 | ## Transform to cascade data structure 195 | nona_times <- apply(data, 2, clean_casc_vec_, mode = "times", data = data, 196 | node_names = node_names) 197 | nona_nodes <- apply(data, 2, clean_casc_vec_, mode = "nodes", data = data, 198 | node_names = node_names) 199 | # If dim(data)[2] = 1 apply returns vector, if > 1 it returns list. Generate 200 | # equivalent output in both cases: 201 | if(inherits(nona_times, "matrix")) { 202 | nona_times <- list(nona_times) 203 | names(cascade_times) <- colnames(nona_times) 204 | cascade_nodes <- list(as.character(nona_nodes)) 205 | names(cascade_nodes) <- colnames(nona_nodes) 206 | 207 | } else { # already list 208 | cascade_nodes <- nona_nodes 209 | } 210 | 211 | # Check if data is consistent 212 | assert_cascade_consistency_(cascade_nodes, nona_times, node_names) 213 | 214 | out <- list("cascade_nodes" = cascade_nodes, 215 | "cascade_times" = nona_times, 216 | "node_names" = node_names) 217 | class(out) <- c("cascade", "list") 218 | out <- order_cascade_(out) 219 | 220 | return(out) 221 | } 222 | 223 | 224 | # Clean cascade vector (remove nas and sort) 225 | clean_casc_vec_ <- function(x, mode, data, node_names) { 226 | n <- node_names[!is.na(x)] 227 | x <- as.numeric(x[!is.na(x)]) 228 | times <- sort(x, decreasing = TRUE) 229 | n <- n[order(x, decreasing = TRUE)] 230 | names(times) <- NULL 231 | names(n) <- NULL 232 | if(mode == "times") return(times) 233 | else return(n) 234 | } 235 | 236 | #' Convert a cascade object to a matrix 237 | #' 238 | #' Generates a \code{\link{matrix}} containing the cascade information in the 239 | #' cascade object in wide format. Missing values are used for nodes that do not 240 | #' experience an event in a cascade. 241 | #' 242 | #' @param x cascade object to convert. 243 | #' @param ... additional arguments to be passed to or from methods. 244 | #' (Currently not supported.) 245 | #' 246 | #' @return A matrix containing all cascade information in wide format. That is, 247 | #' each row of the matrix corresponds to a node and each column to a cascade. 248 | #' Cell entries are event times. Censored nodes have \code{NA} for their entry. 249 | #' 250 | #' @examples 251 | #' 252 | #' data(cascades) 253 | #' as.matrix(cascades) 254 | #' 255 | #' @export 256 | as.matrix.cascade <- function(x, ...) { 257 | 258 | # Check inputs 259 | assert_that(inherits(x, "cascade")) 260 | 261 | cids <- names(x$cascade_times) 262 | nids <- x$node_names 263 | 264 | time_lookup_ <- function(pair) { 265 | cid <- pair[1] 266 | nid <- pair[2] 267 | match <- which(x$cascade_nodes[[cid]] == nid) 268 | if(length(match) == 1) { 269 | return(x$cascade_times[[cid]][match]) 270 | } else { 271 | return(NA) 272 | } 273 | } 274 | 275 | combos <- expand.grid(cids, nids) 276 | times <- apply(combos, 1, time_lookup_) 277 | 278 | # Reshape to matrix 279 | out <- matrix(times, nrow = length(nids), ncol = length(cids), byrow = TRUE) 280 | rownames(out) <- nids 281 | colnames(out) <- cids 282 | 283 | return(out) 284 | } 285 | 286 | #' Select a subset of cascades from cascade object 287 | #' 288 | #' @param cascade cascade, object to select from 289 | #' @param selection character or integer, vector of cascade_ids to select 290 | #' 291 | #' @return An object of class cascade containing just the selected cascades 292 | #' 293 | #' @examples 294 | #' 295 | #' data(policies) 296 | #' cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 297 | #' event_time = 'adopt_year', cascade_id = 'policy') 298 | #' cascade_names <- names(cascades$cascade_times) 299 | #' subset_cascade(cascades, selection = cascade_names[1:10]) 300 | #' 301 | #' @export 302 | subset_cascade <- function(cascade, selection) { 303 | # Check inputs 304 | assert_that(inherits(cascade, 'cascade')) 305 | cascade_names <- names(cascade$cascade_times) 306 | assert_that(all(selection %in% cascade_names)) 307 | 308 | cascade_times <- cascade$cascade_times[selection] 309 | cascade_nodes <- cascade$cascade_nodes[selection] 310 | #node_names <- unique(do.call(c, cascade_nodes)) 311 | node_names <- cascade$node_names 312 | out <- list(cascade_nodes = cascade_nodes, cascade_times = cascade_times, 313 | node_names = node_names) 314 | class(out) <- c('cascade', 'list') 315 | return(out) 316 | } 317 | 318 | #' Drop nodes from a cascade object 319 | #' 320 | #' @param cascades cascade, object to drop nodes from. 321 | #' @param nodes character or integer, vector of node_ids to drop. 322 | #' @param drop logical, Should empty cascades be dropped. 323 | #' 324 | #' @return An object of class cascade containing the cascades without the 325 | #' dropped nodes. 326 | #' 327 | #' @examples 328 | #' 329 | #' data(policies) 330 | #' cascades <- as_cascade_long(policies, cascade_node_name = 'statenam', 331 | #' event_time = 'adopt_year', cascade_id = 'policy') 332 | #' new_cascades <- drop_nodes(cascades, c("California", "New York")) 333 | #' 334 | #' @export 335 | drop_nodes <- function(cascades, nodes, drop = TRUE) { 336 | # Check inputs 337 | assert_that(inherits(cascades, 'cascade')) 338 | assert_that(all(nodes %in% cascades$node_names)) 339 | c_length <- length(cascades$cascade_nodes) 340 | cascade_ids <- names(cascades$cascade_nodes) 341 | 342 | # Drop nodes from cascade_nodes, cascade_times and node_names 343 | drop_idxs <- lapply(cascades$cascade_nodes, function(x) which(x %in% nodes)) 344 | cascade_nodes <- lapply(1:c_length, function(i) { 345 | cascades$cascade_nodes[[i]][-drop_idxs[[i]]] 346 | }) 347 | names(cascade_nodes) <- cascade_ids 348 | cascade_times <- lapply(1:c_length, function(i) { 349 | cascades$cascade_times[[i]][-drop_idxs[[i]]] 350 | }) 351 | names(cascade_times) <- cascade_ids 352 | node_names <- cascades$node_names[!cascades$node_names %in% nodes] 353 | 354 | # Check for empty cascades 355 | if(drop) { 356 | cascade_nodes <- remove_zero_length_(cascade_nodes) 357 | cascade_times <- remove_zero_length_(cascade_times) 358 | } 359 | 360 | out <- list(cascade_nodes = cascade_nodes, cascade_times = cascade_times, 361 | node_names = node_names) 362 | class(out) <- c('cascade', 'list') 363 | return(out) 364 | } 365 | 366 | 367 | 368 | #' Subset a cascade object in time 369 | #' 370 | #' Remove each all events occurring outside the desired subset for each cascade 371 | #' in a cascade object. 372 | #' 373 | #' @param cascade cascade, object to subset. 374 | #' @param start_time numeric, start time of the subset. 375 | #' @param end_time numeric, end time of the subset. 376 | #' @param drop logical, should empty sub-cascades be dropped? 377 | #' 378 | #' @return An object of class cascade, where only events are included that have 379 | #' times \code{start_time} <= t < \code{end_time}. 380 | #' 381 | #' @examples 382 | #' 383 | #' data(cascades) 384 | #' sub_cascades <- subset_cascade_time(cascades, 10, 20, drop=TRUE) 385 | #' 386 | #' @export 387 | subset_cascade_time <- function(cascade, start_time, end_time, drop=TRUE) { 388 | # Check inputs 389 | assert_that(inherits(cascade, 'cascade')) 390 | qassert(start_time, 'N1') 391 | qassert(end_time, 'N1') 392 | qassert(drop, 'B1') 393 | 394 | casc_length <- length(cascade$cascade_nodes) 395 | subset_idxs <- lapply(cascade$cascade_times, function(x) { 396 | out <- which(x >= start_time & x < end_time) 397 | }) 398 | subset_times <- lapply(1:casc_length, function(x) { 399 | return(cascade$cascade_times[[x]][subset_idxs[[x]]]) 400 | }) 401 | subset_nodes <- lapply(1:casc_length, function(x) { 402 | return(cascade$cascade_nodes[[x]][subset_idxs[[x]]]) 403 | }) 404 | names(subset_times) <- names(subset_nodes) <- names(cascade$cascade_times) 405 | if(drop) { 406 | subset_times <- remove_zero_length_(subset_times) 407 | subset_nodes <- remove_zero_length_(subset_nodes) 408 | } 409 | #subset_node_names <- unique(do.call(c, subset_nodes)) 410 | subset_node_names <- cascade$node_names 411 | out <- list(cascade_nodes = subset_nodes, cascade_times = subset_times, 412 | node_names = subset_node_names) 413 | class(out) <- c("cascade", "list") 414 | return(out) 415 | } 416 | 417 | # Remove vectors of length zero from a list of vectors 418 | remove_zero_length_ <- function(x) { 419 | out <- lapply(x, function(y) if(length(y) == 0) return() else return(y)) 420 | return(Filter(Negate(is.null), out)) 421 | } 422 | 423 | #' Convert a cascade object to a data frame 424 | #' 425 | #' Generates a data frame containing the cascade information in the cascade object. 426 | #' 427 | #' @param x Cascade object to convert. 428 | #' @param row.names NULL or a character vector giving the row names for the data 429 | #' frame. Missing values are not allowed. 430 | #' @param optional logical. If TRUE, setting row names and converting column 431 | #' names (to syntactic names: see make.names) is optional. (Not supported) 432 | #' @param ... Additional arguments passed to \code{\link{data.frame}}. 433 | #' 434 | #' @return A data frame with three columns. Containing 1) The names of 435 | #' the nodes (\code{"node_name"}) that experience an event in each cascade, 436 | #' 2) the event time (\code{"event_time"}) of the corresponding node, 437 | #' 3) the cascade identifier \code{"cascade_id"}. 438 | #' 439 | #' @examples 440 | #' 441 | #' data(cascades) 442 | #' as.data.frame(cascades) 443 | #' 444 | #' @export 445 | as.data.frame.cascade <- function(x, row.names = NULL, optional = FALSE, 446 | ...) { 447 | # Check inputs 448 | assert_that(inherits(x, "cascade")) 449 | 450 | # Warning for zero length cascades that will be dropped 451 | zero_length <- sapply(x$cascade_nodes, function(y) length(y) == 0) 452 | if(any(zero_length)) { 453 | dropped <- names(x$cascade_nodes)[zero_length] 454 | msg <- paste("The following cascades have no events and will be dropped: ", 455 | paste(dropped, collapse = " ")) 456 | warning(msg) 457 | } 458 | 459 | # Convert 460 | cascade_nodes <- do.call(c, x$cascade_nodes) 461 | cascade_times <- do.call(c, x$cascade_times) 462 | smry <- summary(x, quiet = TRUE) 463 | ids <- apply(smry, 1, function(x) rep(x[1], each = x[2])) 464 | # apply generates matrix if nrow(smry) = 1 and list if > 1, generate 465 | # consistent output: 466 | if(inherits(ids, "matrix")) { 467 | cascade_ids <- as.character(ids) 468 | } else { 469 | cascade_ids <- do.call(c, ids) 470 | } 471 | out <- data.frame("node_name" = cascade_nodes, 'event_time' = cascade_times, 472 | "cascade_id" = cascade_ids, 473 | stringsAsFactors = FALSE, ...) 474 | if(!is.null(row.names)) { 475 | row.names(out) <- row.names 476 | } else { 477 | row.names(out) <- as.character(c(1:nrow(out))) 478 | } 479 | return(out) 480 | } 481 | 482 | # Sort cascades by event time 483 | # 484 | # @param cascades, object of class cascade 485 | # 486 | # @return An object of class cascade with each cascade (ids and times) ordered 487 | # by event time 488 | order_cascade_ <- function(cascades) { 489 | 490 | casc_names <- names(cascades$cascade_times) 491 | 492 | sort_times <- function(x) return(cascades$cascade_times[[x]][orderings[[x]]]) 493 | sort_nodes <- function(x) return(cascades$cascade_nodes[[x]][orderings[[x]]]) 494 | 495 | orderings <- lapply(cascades$cascade_times, order) 496 | times <- lapply(c(1:length(cascades$cascade_times)), sort_times) 497 | ids <- lapply(c(1:length(cascades$cascade_nodes)), sort_nodes) 498 | 499 | cascades$cascade_nodes <- ids 500 | cascades$cascade_times <- times 501 | 502 | names(cascades$cascade_nodes) <- casc_names 503 | names(cascades$cascade_times) <- casc_names 504 | 505 | return(cascades) 506 | } 507 | 508 | assert_cascade_consistency_ <- function(cascade_nodes, cascade_times, 509 | node_names) { 510 | 511 | # Check if containers for nodes and event times have same length (same number 512 | # of cascades) 513 | if(length(cascade_nodes) != length(cascade_times)) { 514 | stop("cascade_nodes is not the same length as cascade_times.", 515 | call. = FALSE) 516 | } 517 | 518 | # Check if each cascade has same length in event time and node name container 519 | lens_ids <- sapply(cascade_nodes, length) 520 | lens_times <- sapply(cascade_times, length) 521 | if(any(lens_ids != lens_times)) { 522 | stop("Corresponding elements in cascade_nodes and cascade_times are not of 523 | equal length.", call. = FALSE) 524 | } 525 | 526 | # Check that all cascade elements are of correct class 527 | tids <- sapply(cascade_nodes, function(x) assert_that(is.element(class(x), 528 | c("numeric", "character", 529 | "factor", "integer")))) 530 | ttimes <- sapply(cascade_times, function(x) assert_that(is.element(class(x), 531 | c("numeric", "integer")))) 532 | if(!all(tids)) { 533 | stop("At least one element of cascade_nodes is not of class numeric, 534 | integer, character or factor or contains missing values.", 535 | call. = FALSE) 536 | } 537 | if(!all(ttimes)) { 538 | stop("At least one element of cascade_times is not of class numeric 539 | or contains missing values.", call. = FALSE) 540 | } 541 | 542 | # Check consistency between node names in cascades and provided node names 543 | unique_cascade_nodes <- unique(do.call(c, cascade_nodes)) 544 | chk <- is.element(unique_cascade_nodes, node_names) 545 | if(!all(chk)) { 546 | msg <- paste0("The following node(s) that occur in the cascades are ", 547 | "not contained in provided node_names:\n", 548 | unique_cascade_nodes[!chk], "\nPlease provide the full ", 549 | "list of node names.") 550 | stop(msg, .call = FALSE) 551 | } 552 | } --------------------------------------------------------------------------------