31 |
--------------------------------------------------------------------------------
/man/logistic_solve1.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RcppExports.R
3 | \name{logistic_solve1}
4 | \alias{logistic_solve1}
5 | \title{logistic_fit}
6 | \usage{
7 | logistic_solve1(x, y, w, initial_link, i, j, skip)
8 | }
9 | \arguments{
10 | \item{x}{NumericVector, expanatory variable.}
11 |
12 | \item{y}{NumericVector, 0/1 values to fit.}
13 |
14 | \item{w}{NumericVector, weights (required, positive).}
15 |
16 | \item{initial_link, }{initial link estimates (required, all zeroes is a good start).}
17 |
18 | \item{i}{integer, first index (inclusive).}
19 |
20 | \item{j}{integer, last index (inclusive).}
21 |
22 | \item{skip}{integer, index to skip (-1 to not skip).}
23 | }
24 | \value{
25 | vector of a and b.
26 | }
27 | \description{
28 | Calculate y ~ sigmoid(a + b x) using iteratively re-weighted least squares.
29 | Zero indexed.
30 | }
31 | \examples{
32 |
33 | set.seed(5)
34 | d <- data.frame(
35 | x = rnorm(10),
36 | y = sample(c(0,1), 10, replace = TRUE)
37 | )
38 | weights <- runif(nrow(d))
39 | m <- glm(y~x, data = d, family = binomial, weights = weights)
40 | coef(m)
41 | logistic_solve1(d$x, d$y, weights, rep(0.0, nrow(d)), 0, nrow(d)-1, -1)
42 |
43 | }
44 | \keyword{internal}
45 |
--------------------------------------------------------------------------------
/extras/check_reverse_dependencies.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "check_reverse_dependencies"
3 | output: github_document
4 | ---
5 |
6 | ```{r, error=TRUE}
7 | library("prrd")
8 | td <- tempdir()
9 | package = "RcppDynProg"
10 | packageVersion(package)
11 | date()
12 |
13 | parallelCluster <- NULL
14 | # # parallel doesn't work due to https://github.com/r-lib/liteq/issues/22
15 | #ncores <- parallel::detectCores()
16 | #parallelCluster <- parallel::makeCluster(ncores)
17 |
18 | orig_dir <- getwd()
19 | print(orig_dir)
20 | setwd(td)
21 | print(td)
22 |
23 | options(repos = c(CRAN="https://cloud.r-project.org"))
24 | jobsdfe <- enqueueJobs(package=package, directory=td)
25 |
26 | mk_fn <- function(package, directory) {
27 | force(package)
28 | force(directory)
29 | function(i) {
30 | library("prrd")
31 | setwd(directory)
32 | Sys.sleep(1*i)
33 | dequeueJobs(package=package, directory=directory)
34 | }
35 | }
36 | f <- mk_fn(package=package, directory=td)
37 |
38 | if(!is.null(parallelCluster)) {
39 | parallel::parLapply(parallelCluster, seq_len(ncores), f)
40 | } else {
41 | f(0)
42 | }
43 |
44 | summariseQueue(package=package, directory=td)
45 |
46 | setwd(orig_dir)
47 | if(!is.null(parallelCluster)) {
48 | parallel::stopCluster(parallelCluster)
49 | }
50 |
51 | ```
52 |
53 |
--------------------------------------------------------------------------------
/R/xlin_fits_V.R:
--------------------------------------------------------------------------------
1 |
2 | #' xlin_fits_R
3 | #'
4 | #' Calculate out of sample linear fit predictions.
5 | #'
6 | #' @param x NumericVector, x-coords of values to group (length>=2).
7 | #' @param y NumericVector, values to group in order.
8 | #' @param w NumericVector, weights (positive).
9 | #' @return vector of predictions.
10 | #'
11 | #' @keywords internal
12 | #'
13 | #' @examples
14 | #'
15 | #' xlin_fits_V(c(1, 2, 3, 4), c(1, 2, 2, 1), c(1, 1, 1, 1))
16 | #'
17 | #' @export
18 | #'
19 | xlin_fits_V <- function(x, y, w) {
20 | n = length(y)
21 | # build fitting data
22 | regularization = 1.0e-5
23 | xx_0_0 = numeric(n) + sum(w*1)
24 | xx_1_0 = numeric(n) + sum(w*x)
25 | xx_0_1 = numeric(n) + sum(w*x)
26 | xx_1_1 = numeric(n) + sum(w*x*x)
27 | xy_0 = numeric(n) + sum(w*y)
28 | xy_1 = numeric(n) + sum(w*x*y)
29 | xx_1_0 = xx_1_0 + regularization
30 | xx_0_1 = xx_0_1 + regularization
31 | # pull out k'th observation
32 | xxk_0_0 = xx_0_0 - w*1
33 | xxk_1_0 = xx_1_0 - w*x
34 | xxk_0_1 = xx_0_1 - w*x
35 | xxk_1_1 = xx_1_1 - w*x*x
36 | xyk_0 = xy_0 - w*y
37 | xyk_1 = xy_1 - w*x*y
38 | # solve linear system
39 | det = xxk_0_0*xxk_1_1 - xxk_0_1*xxk_1_0
40 | c0 = (xxk_1_1*xyk_0 - xxk_0_1*xyk_1)/det
41 | c1 = (-xxk_1_0*xyk_0 + xxk_0_0*xyk_1)/det
42 | # form estimate
43 | y_est = c0 + c1*x
44 | return(y_est)
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/extras/xlin_fits_py.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | Created on Sun Dec 30 08:46:34 2018
5 |
6 | @author: johnmount
7 | """
8 |
9 | import numpy
10 |
11 | def xlin_fits_py(x, y, w):
12 | """return all out of sample fits of y~a*x+b weighted by w, all values numpy arrays"""
13 | n = len(y)
14 | # build fitting data
15 | regularization = 1.0e-5
16 | xx_0_0 = numpy.zeros(n) + numpy.sum(w*1)
17 | xx_1_0 = numpy.zeros(n) + numpy.sum(w*x)
18 | xx_0_1 = numpy.zeros(n) + numpy.sum(w*x)
19 | xx_1_1 = numpy.zeros(n) + numpy.sum(w*x*x)
20 | xy_0 = numpy.zeros(n) + numpy.sum(w*y)
21 | xy_1 = numpy.zeros(n) + numpy.sum(w*x*y)
22 | xx_1_0 = xx_1_0 + regularization
23 | xx_0_1 = xx_0_1 + regularization
24 | # pull out k'th observation
25 | xxk_0_0 = xx_0_0 - w*1
26 | xxk_1_0 = xx_1_0 - w*x
27 | xxk_0_1 = xx_0_1 - w*x
28 | xxk_1_1 = xx_1_1 - w*x*x
29 | xyk_0 = xy_0 - w*y
30 | xyk_1 = xy_1 - w*x*y
31 | # solve linear system
32 | det = xxk_0_0*xxk_1_1 - xxk_0_1*xxk_1_0
33 | c0 = (xxk_1_1*xyk_0 - xxk_0_1*xyk_1)/det
34 | c1 = (-xxk_1_0*xyk_0 + xxk_0_0*xyk_1)/det
35 | # form estimate
36 | y_est = c0 + c1*x
37 | return(y_est)
38 |
39 | # x = numpy.asarray([1 ,2, 3, 4])
40 | # y = numpy.asarray([1, 2, 2, 1])
41 | # w = numpy.asarray([1, 1, 1, 1])
42 | # xlin_fits_py(x, y, w)
43 | ## array([2.666715 , 1.28571541, 1.28571214, 2.66666833])
--------------------------------------------------------------------------------
/extras/Timings.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Timings"
3 | output: github_document
4 | ---
5 |
6 |
7 |
8 |
9 | ```{r}
10 | knitr::opts_chunk$set(fig.width=12, fig.height=8)
11 | library("RcppDynProg")
12 | library("WVPlots")
13 | library("microbenchmark")
14 | library("rqdatatable")
15 |
16 |
17 | set.seed(2018)
18 | n <- 500
19 | x <- matrix(runif(n*n), nrow=n, ncol=n)
20 |
21 | solve_interval_partition(x, n)
22 |
23 | solve_interval_partition_R(x, n)
24 |
25 | timings <- microbenchmark(
26 | solve_interval_partition(x, n),
27 | solve_interval_partition_R(x, n),
28 | times = 5L)
29 |
30 | print(timings)
31 |
32 | p <- data.frame(timings)
33 | p$seconds <- p$time/1e+9
34 | p$method <- as.factor(p$expr)
35 | p$method <- reorder(p$method, p$seconds)
36 |
37 | summary <- p %.>%
38 | project(.,
39 | mean_seconds = mean(seconds),
40 | groupby = "method")
41 | print(summary)
42 | ratio <- max(summary$mean_seconds)/min(summary$mean_seconds)
43 | print(ratio)
44 |
45 | WVPlots::ScatterBoxPlotH(p,
46 | "seconds", "method",
47 | "performance of same dynamic programming code in R and Rcpp (C++)")
48 | ```
49 |
50 |
51 |
52 |
53 | ---------------------
54 |
55 |
56 | Timings on a 2018 Dell XPS-13 laptop, 16 Gib RAM, LPDDR3, 2133 MT/s, Intel(R) Core(TM) i5-8250U CPU @ 1.60GHz (8 cores reported), idle, charged, and plugged into power supply. Ubuntu 18.04.1 LTS.
57 |
58 | ```{r}
59 | R.version.string
60 |
61 | R.version
62 |
63 | sessionInfo()
64 | ```
65 |
66 |
--------------------------------------------------------------------------------
/inst/tinytest/test_scoring.R:
--------------------------------------------------------------------------------
1 |
2 | test_scoring <- function() {
3 | set.seed(2018)
4 | g <- 50
5 | d <- data.frame(
6 | x = 1:(3*g)) # ordered in x
7 | d$y_ideal <- c(rep(0, g), rep(1, g), rep(-1, g))
8 | d$y_observed <- d$y_ideal + rnorm(length(d$y_ideal))
9 | d$w <- 1 + numeric(nrow(d))
10 |
11 | # expected loss of using the mean of other points to
12 | # estimate each point
13 | inflated_var <- function(x, penalty) {
14 | n <- length(x)
15 | if(n<=1) {
16 | return(100000)
17 | }
18 | meanx <- mean(x)
19 | (n/(n-1))^2*sum((x-meanx)^2) + penalty/sqrt(n)
20 | }
21 |
22 | c1 <- inflated_var(d$y_observed, 0)
23 | c2 <- const_cost(d$y_observed, d$w, 1, 0, length(d$y_observed-1)-1)
24 | expect_true(abs(c1-c2)<1.e-6)
25 |
26 | y_permuted <- d$y_ideal[sample.int(nrow(d), nrow(d), replace = FALSE)]
27 |
28 | create_cost_matrix <- function(ycol, penalty) {
29 | n <- length(ycol)
30 | x <- matrix(0, nrow=n, ncol=n)
31 | for(i in 1:n) {
32 | x[i,i] <- 100000 # big penalty
33 | if(i
59 |
60 | ``` r
61 | costs <- lin_costs_logistic(d$x, d$y_observed, w, 40, seq_len(nrow(d)))
62 | costs <- costs + 5
63 | soln <- solve_interval_partition(costs, 20)
64 | print(soln)
65 | ```
66 |
67 | ## [1] 1 59 255 315 424 471 601
68 |
69 | ``` r
70 | preds <- numeric(nrow(d))
71 | for(i in seqi(1, length(soln)-1)) {
72 | predsi <- logistic_fits(d$x, d$y_observed, w, soln[[i]]-1, soln[[i+1]]-2)
73 | preds[seqi(soln[[i]], soln[[i+1]]-1)] <- predsi
74 | }
75 | d$link <- pmin(5, pmax(-5, preds))
76 | d$group <- findInterval(seq_len(nrow(d)), soln)
77 |
78 |
79 | if(plot) {
80 | plt2 <- ggplot(data = d, aes(x = x)) +
81 | geom_line(aes(y = y_ideal), linetype=2) +
82 | geom_line(aes(y = link, group = group), linetype=3, color="blue") +
83 | geom_point(aes(y = y_plot, color = as.factor(d$y_observed)), alpha = 0.5) +
84 | ylab("y") +
85 | guides(color = FALSE) +
86 | ggtitle("raw data",
87 | subtitle = "dots: observed values, cuts/curve: recovered model")
88 | for(ci in soln) {
89 | if((ci>1)&&(ci| \n", 61 | " | x | \n", 62 | "y_ideal | \n", 63 | "y_observed | \n", 64 | "
|---|---|---|---|
| 0 | \n", 69 | "0.05 | \n", 70 | "0.000225 | \n", 71 | "0.041055 | \n", 72 | "
| 1 | \n", 75 | "0.10 | \n", 76 | "0.000900 | \n", 77 | "0.502473 | \n", 78 | "
| 2 | \n", 81 | "0.15 | \n", 82 | "0.002025 | \n", 83 | "-0.027719 | \n", 84 | "
| 3 | \n", 87 | "0.20 | \n", 88 | "0.003600 | \n", 89 | "-0.260338 | \n", 90 | "
| 4 | \n", 93 | "0.25 | \n", 94 | "0.005625 | \n", 95 | "0.308052 | \n", 96 | "
vignettes/RcppDynProg.Rmd
90 | RcppDynProg.RmdThis content has moved to the package 97 | README.
98 |
70 |
71 | ``` r
72 | penalty <- 1
73 |
74 | soln <- solve_for_partition(sp500recent$x, sp500recent$log_price, penalty = penalty)
75 | sp500recent$estimate <- exp(approx(soln$x, soln$pred,
76 | xout = sp500recent$x,
77 | method = "linear", rule = 2)$y)
78 | sp500recent$group <- as.character(
79 | findInterval(sp500recent$x, soln[soln$what=="left", "x"]))
80 |
81 | ggplot(data = sp500recent, aes(x = Date)) +
82 | geom_line(aes(y=Adj.Close), color = "darkgray") +
83 | geom_line(aes(y=estimate, color = group)) +
84 | ggtitle("segment approximation of historic sp500 data",
85 | subtitle = paste("per-segment penalty =", penalty)) +
86 | theme(legend.position = "none") +
87 | scale_color_brewer(palette = "Dark2") +
88 | scale_y_log10()
89 | ```
90 |
91 |
92 |
93 | ``` r
94 | penalty <- 5
95 |
96 | soln <- solve_for_partition(sp500$x, sp500$log_price, penalty = penalty)
97 | sp500$estimate <- exp(approx(soln$x, soln$pred,
98 | xout = sp500$x,
99 | method = "linear", rule = 2)$y)
100 |
101 | sp500$group <- as.character(
102 | findInterval(sp500$x, soln[soln$what=="left", "x"]))
103 |
104 | ggplot(data = sp500, aes(x = Date)) +
105 | geom_line(aes(y=Adj.Close), color = "darkgray") +
106 | geom_line(aes(y=estimate, group = group), color = "darkgreen") +
107 | ggtitle("segment approximation of historic sp500 data",
108 | subtitle = paste("per-segment penalty =", penalty)) +
109 | theme(legend.position = "none") +
110 | scale_color_manual(values = colors) +
111 | scale_y_log10()
112 | ```
113 |
114 |
115 |
116 | Naive gaps (TODO: need to find breakpoints that are good for the no-gap solution).
117 |
118 | ``` r
119 | sl <- soln[soln$what=='left', ]
120 | fit <- vtreat:::encode_x_as_lambdas(
121 | sp500$x, min(sp500$x), max(sp500$x),
122 | sl$x)
123 | vars <- setdiff(colnames(fit), "intercept")
124 | fit$y <- sp500$log_price
125 | fmla <- wrapr::mk_formula("y", vars)
126 | model <- lm(fmla, data = fit)
127 | sp500$pred <- exp(predict(model, newdata = fit))
128 |
129 | ggplot(data = sp500, aes(x = Date)) +
130 | geom_line(aes(y=Adj.Close), color = "darkgray") +
131 | geom_line(aes(y=pred), color = "darkgreen") +
132 | ggtitle("segment approximation (no gaps) of historic sp500 data",
133 | subtitle = paste("per-segment penalty =", penalty)) +
134 | theme(legend.position = "none") +
135 | scale_color_brewer(palette = "Dark2") +
136 | scale_y_log10()
137 | ```
138 |
139 |
140 |
141 | Fit piecewise constant on delta series.
142 |
143 | ``` r
144 | penalty <- 0.01
145 |
146 | sp500$delta_log_price <- c(0, sp500$log_price[-1] - sp500$log_price[-nrow(sp500)])
147 |
148 | soln <- solve_for_partitionc(sp500$x, sp500$delta_log_price, penalty = penalty)
149 | sl <- soln[soln$what=='left', ]
150 | fit <- vtreat:::encode_x_as_lambdas(
151 | sp500$x, min(sp500$x), max(sp500$x),
152 | sl$x)
153 | vars <- setdiff(colnames(fit), "intercept")
154 | fit$y <- sp500$log_price
155 | fmla <- wrapr::mk_formula("y", vars)
156 | model <- lm(fmla, data = fit)
157 | sp500$pred <- exp(predict(model, newdata = fit))
158 |
159 | ggplot(data = sp500, aes(x = Date)) +
160 | geom_line(aes(y=Adj.Close), color = "darkgray") +
161 | geom_line(aes(y=pred), color = "darkgreen") +
162 | ggtitle("segment approximation (slope ests, no gaps) of historic sp500 data",
163 | subtitle = paste("per-segment penalty =", penalty)) +
164 | theme(legend.position = "none") +
165 | scale_color_brewer(palette = "Dark2") +
166 | scale_y_log10()
167 | ```
168 |
169 |
170 |
--------------------------------------------------------------------------------
/docs/authors.html:
--------------------------------------------------------------------------------
1 |
2 | Rcpp dynamic programming solutions for partitioning and machine learning problems. 139 | Includes out of sample fitting applications. 140 | Also supplies additional custom coders for the vtreat package. 141 | Please see https://github.com/WinVector/RcppDynProg for details.
142 |John Mount
149 | 150 |