Skip to content

Commit b28ddde

Browse files
committed
large-data.Rmd: specify r_eff argument
1 parent 26458ac commit b28ddde

1 file changed

Lines changed: 95 additions & 38 deletions

File tree

vignettes/large-data.Rmd

Lines changed: 95 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,12 @@ log-likelihood values in memory.
9191

9292
The log-likelihood in R can be coded as follows:
9393
```{r llfun_logistic}
94-
llfun_logistic <- function(data_i, draws) {
94+
# we'll add an argument log to toggle whether this is a log-likelihood or
95+
# likelihood function. this will be useful later in the vignette.
96+
llfun_logistic <- function(data_i, draws, log = TRUE) {
9597
x_i <- as.matrix(data_i[, which(grepl(colnames(data_i), pattern = "X")), drop=FALSE])
9698
logit_pred <- draws %*% t(x_i)
97-
dbinom(x = data_i$y, size = 1, prob = 1/(1 + exp(-logit_pred)), log = TRUE)
99+
dbinom(x = data_i$y, size = 1, prob = 1/(1 + exp(-logit_pred)), log = log)
98100
}
99101
```
100102

@@ -133,15 +135,28 @@ function we wrote is working as it should. The `loo_i()` function is a helper
133135
function that can be used to test a log-likelihood function on a single observation.
134136

135137
```{r, eval=FALSE}
138+
# used for draws argument to loo_i
136139
parameter_draws_1 <- extract(fit_1)$beta
140+
141+
# used for data argument to loo_i
137142
stan_df_1 <- as.data.frame(standata)
138-
loo_i(1, llfun_logistic, data = stan_df_1, draws = parameter_draws_1)
143+
144+
# compute relative efficiency (this is slow and optional but is recommended to allow
145+
# for adjusting PSIS effective sample size based on MCMC effective sample size)
146+
r_eff <- relative_eff(llfun_logistic,
147+
log = FALSE, # relative_eff wants likelihood not log-likelihood values
148+
chain_id = rep(1:4, each = 1000),
149+
data = stan_df_1,
150+
draws = parameter_draws_1,
151+
cores = 2)
152+
153+
loo_i(i = 1, llfun_logistic, r_eff = r_eff, data = stan_df_1, draws = parameter_draws_1)
139154
```
140155

141156
```
142157
$pointwise
143-
elpd_loo mcse_elpd_loo p_loo looic
144-
1 -0.3310342 0.0002908997 0.0003487243 0.6620683
158+
elpd_loo mcse_elpd_loo p_loo looic influence_pareto_k
159+
1 -0.3314552 0.0002887608 0.0003361772 0.6629103 -0.05679886
145160
...
146161
```
147162

@@ -151,16 +166,16 @@ We can then use the `loo_subsample()` function to compute the efficient PSIS-LOO
151166
approximation to exact LOO-CV using subsampling:
152167

153168
```{r, eval=FALSE}
154-
parameter_draws_1 <- extract(fit_1)$beta
155-
stan_df_1 <- as.data.frame(standata)
156-
157169
set.seed(4711)
158170
loo_ss_1 <-
159171
loo_subsample(
160172
llfun_logistic,
173+
observations = 100, # take a subsample of size 100
174+
cores = 2,
175+
# these next objects were computed above
176+
r_eff = r_eff,
161177
draws = parameter_draws_1,
162-
data = stan_df_1,
163-
observations = 100 # take a subsample of size 100
178+
data = stan_df_1
164179
)
165180
print(loo_ss_1)
166181
```
@@ -205,8 +220,14 @@ simply add more samples until we are satisfied using the `update()` method.
205220

206221
```{r, eval=FALSE}
207222
set.seed(4711)
208-
loo_ss_1b <- update(loo_ss_1, draws = parameter_draws_1, data = stan_df_1,
209-
observations = 200) # subsample 200 instead of 100
223+
loo_ss_1b <-
224+
update(
225+
loo_ss_1,
226+
observations = 200, # subsample 200 instead of 100
227+
r_eff = r_eff,
228+
draws = parameter_draws_1,
229+
data = stan_df_1
230+
)
210231
print(loo_ss_1b)
211232
```
212233

@@ -241,12 +262,14 @@ set.seed(4711)
241262
loo_ss_1c <-
242263
loo_subsample(
243264
x = llfun_logistic,
265+
r_eff = r_eff,
244266
draws = parameter_draws_1,
245267
data = stan_df_1,
246268
observations = 100,
247269
estimator = "hh_pps", # use Hansen-Hurwitz
248270
loo_approximation = "lpd", # use lpd instead of plpd
249-
loo_approximation_draws = 100
271+
loo_approximation_draws = 100,
272+
cores = 2
250273
)
251274
print(loo_ss_1c)
252275
```
@@ -297,7 +320,8 @@ loo_ap_1 <-
297320
draws = parameter_draws_laplace,
298321
data = stan_df_1,
299322
log_p = log_p,
300-
log_g = log_g
323+
log_g = log_g,
324+
cores = 2
301325
)
302326
print(loo_ap_1)
303327
```
@@ -340,7 +364,8 @@ loo_ap_ss_1 <-
340364
data = stan_df_1,
341365
log_p = log_p,
342366
log_g = log_g,
343-
observations = 100
367+
observations = 100,
368+
cores = 2
344369
)
345370
print(loo_ap_ss_1)
346371
```
@@ -387,23 +412,46 @@ parameter_draws_2 <- extract(fit_2)$beta
387412
stan_df_2 <- as.data.frame(standata)
388413
389414
# recompute subsampling loo for first model for demonstration purposes
415+
416+
# compute relative efficiency (this is slow and optional but is recommended to allow
417+
# for adjusting PSIS effective sample size based on MCMC effective sample size)
418+
r_eff_1 <- relative_eff(
419+
llfun_logistic,
420+
log = FALSE, # relative_eff wants likelihood not log-likelihood values
421+
chain_id = rep(1:4, each = 1000),
422+
data = stan_df_1,
423+
draws = parameter_draws_1,
424+
cores = 2
425+
)
426+
390427
set.seed(4711)
391-
loo_ss_1 <-
392-
loo_subsample(
393-
x = llfun_logistic,
394-
draws = parameter_draws_1,
395-
data = stan_df_1,
396-
observations = 200
397-
)
428+
loo_ss_1 <- loo_subsample(
429+
x = llfun_logistic,
430+
r_eff = r_eff_1,
431+
draws = parameter_draws_1,
432+
data = stan_df_1,
433+
observations = 200,
434+
cores = 2
435+
)
398436
399437
# compute subsampling loo for a second model (with log-arsenic)
400-
loo_ss_2 <-
401-
loo_subsample(
402-
x = llfun_logistic,
403-
draws = parameter_draws_2,
404-
data = stan_df_2,
405-
observations = 200
406-
)
438+
439+
r_eff_2 <- relative_eff(
440+
llfun_logistic,
441+
log = FALSE, # relative_eff wants likelihood not log-likelihood values
442+
chain_id = rep(1:4, each = 1000),
443+
data = stan_df_2,
444+
draws = parameter_draws_2,
445+
cores = 2
446+
)
447+
loo_ss_2 <- loo_subsample(
448+
x = llfun_logistic,
449+
r_eff = r_eff_2,
450+
draws = parameter_draws_2,
451+
data = stan_df_2,
452+
observations = 200,
453+
cores = 2
454+
)
407455
408456
print(loo_ss_2)
409457
```
@@ -455,9 +503,11 @@ we can simply extract the observations used in `loo_ss_1` and use them in
455503
loo_ss_2 <-
456504
loo_subsample(
457505
x = llfun_logistic,
506+
r_eff = r_eff_2,
458507
draws = parameter_draws_2,
459508
data = stan_df_2,
460-
observations = loo_ss_1
509+
observations = loo_ss_1,
510+
cores = 2
461511
)
462512
```
463513

@@ -466,13 +516,14 @@ helper function:
466516

467517
```{r, eval=FALSE}
468518
idx <- obs_idx(loo_ss_1)
469-
loo_ss_2 <-
470-
loo_subsample(
471-
x = llfun_logistic,
472-
draws = parameter_draws_2,
473-
data = stan_df_2,
474-
observations = idx
475-
)
519+
loo_ss_2 <- loo_subsample(
520+
x = llfun_logistic,
521+
r_eff = r_eff_2,
522+
draws = parameter_draws_2,
523+
data = stan_df_2,
524+
observations = idx,
525+
cores = 2
526+
)
476527
```
477528

478529
```
@@ -510,7 +561,13 @@ It is also possible to compare a subsampled loo computation with a full loo obje
510561

511562
```{r, eval=FALSE}
512563
# use loo() instead of loo_subsample() to compute full PSIS-LOO for model 2
513-
loo_full_2 <- loo(x = llfun_logistic, draws = parameter_draws_2, data = stan_df_2)
564+
loo_full_2 <- loo(
565+
x = llfun_logistic,
566+
r_eff = r_eff_2,
567+
draws = parameter_draws_2,
568+
data = stan_df_2,
569+
cores = 2
570+
)
514571
loo_compare(loo_ss_1, loo_full_2)
515572
```
516573

0 commit comments

Comments (0)