In this vignette, we reproduce Table 9.1, p. 367, of Boos and Stefanski (2013), “Essential Statistical Inference.” This table summarizes findings from a simulation study of the sampling characteristics of three estimators (the sample mean, a trimmed mean and the sample median) under random sampling with three population distributions and three sample sizes. In particular, the table summarizes Monte Carlo estimation of the variance (times n) of these estimators. The function `mc.se.vector’ enables calculation of the Monte Carlo standard error (SE) associated with these variance estimates. The first part of the vignette creates several entries to show the basic code. The second part creates the full table with SEs.
To get started we generate N=10,000 data sets of size n=15 from the normal(0,1) distribution.
N <- 10000
set.seed(346) # sets the random number seed
z <- matrix(rnorm(N*15),nrow=N) # N rows of N(0,1) samples, n=15
Then create vectors of N=10,000 means, 20% trimmed means, and medians computed from samples of size n=15.
out.m.15 <- apply(z,1,mean) # mean for each sample
out.t20.15 <- apply(z,1,mean,trim=0.20) # 20% trimmed mean for each sample
out.med.15 <- apply(z,1,median) # median for each sample
Now we use mc.se.vector to get
some of the main table entries and associated jackknife standard errors
for n
times the sample variance of the three estimators
(mean, trimmed mean, median) using the R function
varn=function(x,n){n*var(x)}
. Note that n=15
is an extra parameter to summary.f=varn
.
> mc.se.vector(out.m.15,summary.f=varn,n=15)
summary se N method
1 0.9885314 0.01407249 10000 Jackknife
> mc.se.vector(out.t20.15,summary.f=varn,n=15)
summary se N method
1 1.111288 0.01579671 10000 Jackknife
> mc.se.vector(out.med.15,summary.f=varn,n=15)
summary se N method
1 1.472833 0.02110376 10000 Jackknife
Rounding, we can see that the rounded first entries above, (0.99, 1.11, 1.47), are the first 3 elements in the first row of Table 9.1.
Next we illustrate similar code (adding B= and seed=) to get bootstrap SEs.
> mc.se.vector(out.m.15,B=1000,seed=583,summary.f=varn,n=15)
summary se n method B seed
1 0.9885314 0.01444097 10000 Bootstrap 1000 583
> mc.se.vector(out.t20.15,B=1000,seed=264,summary.f=varn,n=15)
summary se n method B seed
1 1.111288 0.01557149 10000 Bootstrap 1000 264
> mc.se.vector(out.med.15,B=1000,seed=520,summary.f=varn,n=15)
summary se n method B seed
1 1.472833 0.02208845 10000 Bootstrap 1000 520
The main table entries (“summary”) are identical to the jackknife runs, but the bootstrap SEs are slightly different from the jackknife SEs in the 3rd decimal place.
Here we first generate all of the Monte Carlo output, and then compute all the main entries and SEs of Table 9.1. We start again by generating N=10,000 data sets of size n=15 from the normal(0,1) distribution.
N <- 10000
set.seed(346) # sets the random number seed
z <- matrix(rnorm(N*15),nrow=N) # N rows of N(0,1) samples, n=15
and create vectors of N=10,000 means, 20% trimmed means, and medians computed from samples of size n=15.
out.m.15 <- apply(z,1,mean) # mean for each sample
out.t20.15 <- apply(z,1,mean,trim=0.20) # 20% trimmed mean for each sample
out.med.15 <- apply(z,1,median) # median for each sample
But now we store the estimates in a matrix raw.out
that
will hold all 27 columns of the output.
raw.out <- matrix(0,nrow=10000,ncol=27) # to hold all the estimators
raw.out[,1] <- out.m.15
raw.out[,2] <- out.t20.15
raw.out[,3] <- out.med.15
Repeat for n=30
set.seed(347)
z <- matrix(rnorm(N*30),nrow=N)
out.m.30 <- apply(z,1,mean)
out.t20.30 <- apply(z,1,mean,trim=0.20)
out.med.30 <- apply(z,1,median)
raw.out[,4] <- out.m.30
raw.out[,5] <- out.t20.30
raw.out[,6] <- out.med.30
and n=120
set.seed(348)
z <- matrix(rnorm(N*120),nrow=N)
out.m.120 <- apply(z,1,mean)
out.t20.120 <- apply(z,1,mean,trim=0.20)
out.med.120 <- apply(z,1,median)
raw.out[,7] <- out.m.120
raw.out[,8] <- out.t20.120
raw.out[,9] <- out.med.120
Repeat the above for Laplace and t5 distributions (see Extra Code at the end). After running the extra
code, we have an N=10,000 by 27 matrix raw.out
that
contains all three estimators for 3 different sample sizes and three
different distributions.
Let’s compute the main table entries and the jackknife SEs. The next
3 lines of code are just for indexing so that we pull off the right
columns from raw.out
and enter the sample sizes held in
index.n
.
entry.table9.1 <- matrix(0,nrow=3,ncol=9)
se.table9.1 <- matrix(0,nrow=3,ncol=9)
index.m <- c(1:3,10:12,19:21,4:6,13:15,22:24,7:9,16:18,25:27)
index.n <- c(rep(15,9),rep(30,9),rep(120,9))
We use mc.se.vector to get the
jackknife standard errors for n
times the sample variance
of the three estimators (mean, trimmed mean, median) using the R
function varn=function(x,n){n*var(x)}
. Note that
n=
is an extra parameter to summary.f=varn
.
The main entries in Table 9.1 are also returned by
mc.se.vector
.
for(i in 1:9){
out <- mc.se.vector(raw.out[,index.m[i]],summary.f=varn,n=index.n[i])
entry.table9.1[1,i] <- out$summary
se.table9.1[1,i] <- out$se
}
for(i in 1:9){
out <- mc.se.vector(raw.out[,index.m[i+9]],summary.f=varn,n=index.n[i+9])
entry.table9.1[2,i] <- out$summary
se.table9.1[2,i] <- out$se
}
for(i in 1:9){
out <- mc.se.vector(raw.out[,index.m[i+18]],summary.f=varn,n=index.n[i+18])
entry.table9.1[3,i] <- out$summary
se.table9.1[3,i] <- out$se
}
table.9.1 <- data.frame(round(entry.table9.1,2))
table.9.1.jackknife.se <- data.frame(round(se.table9.1,3))
names(table.9.1) <- c("n.m","n.t20","n.med","l.m","l.t20","l.med","t.m","t.t20","t.med")
row.names(table.9.1) <- c("n=15","n=30","n=120")
names(table.9.1.jackknife.se) <- c("n.m","n.t20","n.med","l.m","l.t20","l.med","t.m","t.t20","t.med")
row.names(table.9.1.jackknife.se) <- c("n=15","n=30","n=120")
The results are
> table.9.1
n.m n.t20 n.med l.m l.t20 l.med t.m t.t20 t.med
n=15 0.99 1.11 1.47 1.00 0.70 0.71 1.02 0.85 1.06
n=30 1.02 1.16 1.53 0.99 0.67 0.64 1.00 0.81 1.00
n=120 1.01 1.15 1.57 0.99 0.65 0.57 1.00 0.83 1.05
> table.9.1.jackknife.se
n.m n.t20 n.med l.m l.t20 l.med t.m t.t20 t.med
n=15 0.014 0.016 0.021 0.015 0.011 0.012 0.015 0.012 0.016
n=30 0.015 0.016 0.022 0.015 0.010 0.010 0.015 0.012 0.014
n=120 0.014 0.017 0.023 0.014 0.009 0.009 0.014 0.012 0.015
We recommend reporting only a summary of these standard errors in the table footnote. For example, we used the range 0.01 to 0.02 in the Note at the bottom of Table 9.1 based on
> range(table.9.1.jackknife.se)
[1] 0.009 0.023
But we might have reported the average SE from
> mean(as.matrix(table.9.1.jackknife.se))
[1] 0.01437037
Next we repeat the above steps, but adding B
and
seed
, to get bootstrap SEs.
se.Boot <- matrix(0,nrow=3,ncol=9)
index.m <- c(1:3,10:12,19:21,4:6,13:15,22:24,7:9,16:18,25:27)
index.n <- c(rep(15,9),rep(30,9),rep(120,9))
set.seed(3928)
iseed <- sample(1:1000,9)
for(i in 1:9){se.Boot[1,i] <- mc.se.vector(raw.out[,index.m[i]],
summary.f=varn,B=1000,seed=iseed[i],n=index.n[i])$se}
for(i in 1:9){se.Boot[2,i] <- mc.se.vector(raw.out[,index.m[i+9]],
summary.f=varn,B=1000,seed=iseed[i]+50,n=index.n[i+9])$se}
for(i in 1:9){se.Boot[3,i] <- mc.se.vector(raw.out[,index.m[i+18]],
summary.f=varn,B=1000,seed=iseed[i]+100,n=index.n[i+18])$se}
table.9.1.bootstrap.se <- data.frame(round(se.Boot,3))
names(table.9.1.bootstrap.se) <- c("n.m","n.t20","n.med","l.m","l.t20","l.med","t.m","t.t20","t.med")
row.names(table.9.1.bootstrap.se) <- c("n=15","n=30","n=120")
The results are very similar to the jackknife:
> table.9.1.bootstrap.se
n.m n.t20 n.med l.m l.t20 l.med t.m t.t20 t.med
n=15 0.014 0.016 0.021 0.015 0.011 0.013 0.016 0.013 0.016
n=30 0.015 0.017 0.022 0.015 0.010 0.010 0.015 0.012 0.014
n=120 0.014 0.016 0.022 0.014 0.009 0.009 0.014 0.012 0.015
> range(table.9.1.bootstrap.se)
[1] 0.009 0.022
> mean(as.matrix(table.9.1.bootstrap.se))
[1] 0.01444444
The following code fills out the other 18 columns of
raw.out
.
# Generate Laplace data sets
N <- 10000
set.seed(350) # sets the random number seed
z1 <- matrix(rexp(N*15),nrow=N)
z2 <- matrix(rexp(N*15),nrow=N)
z<-(z1-z2)/sqrt(2) # subtract standard exponentials
out.m.15 <- apply(z,1,mean)
out.t20.15 <- apply(z,1,mean,trim=0.20)
out.med.15 <- apply(z,1,median)
raw.out[,10]=out.m.15
raw.out[,11]=out.t20.15
raw.out[,12]=out.med.15
set.seed(351)
z1 <- matrix(rexp(N*30),nrow=N)
z2 <- matrix(rexp(N*30),nrow=N)
z<-(z1-z2)/sqrt(2)
out.m.30 <- apply(z,1,mean)
out.t20.30 <- apply(z,1,mean,trim=0.20)
out.med.30 <- apply(z,1,median)
raw.out[,13] <- out.m.30
raw.out[,14] <- out.t20.30
raw.out[,15] <- out.med.30
set.seed(352)
z1 <- matrix(rexp(N*120),nrow=N)
z2 <- matrix(rexp(N*120),nrow=N)
z<-(z1-z2)/sqrt(2)
out.m.120 <- apply(z,1,mean)
out.t20.120 <- apply(z,1,mean,trim=0.20)
out.med.120 <- apply(z,1,median)
raw.out[,16] <- out.m.120
raw.out[,17] <- out.t20.120
raw.out[,18] <- out.med.120
laplace.15 <- data.frame(mean=15*var(out.m.15),trim20=15*var(out.t20.15),median=15*var(out.med.15))
laplace.30 <- data.frame(mean=30*var(out.m.30),trim20=30*var(out.t20.30),median=30*var(out.med.30))
laplace.120 <- data.frame(mean=120*var(out.m.120),trim20=120*var(out.t20.120),median=120*var(out.med.120))
# Generate t(df=5) data sets
N <- 10000
set.seed(360) # sets the random number seed
z <- matrix(rt(N*15,df=5)/sqrt(5/3),nrow=N) # N rows of t5 standardized RVs
out.m.15 <- apply(z,1,mean)
out.t20.15 <- apply(z,1,mean,trim=0.20)
out.med.15 <- apply(z,1,median)
raw.out[,19] <- out.m.15
raw.out[,20] <- out.t20.15
raw.out[,21] <- out.med.15
set.seed(361)
z <- matrix(rt(N*30,df=5)/sqrt(5/3),nrow=N)
out.m.30 <- apply(z,1,mean)
out.t20.30 <- apply(z,1,mean,trim=0.20)
out.med.30 <- apply(z,1,median)
raw.out[,22] <- out.m.30
raw.out[,23] <- out.t20.30
raw.out[,24] <- out.med.30
set.seed(362)
z <- matrix(rt(N*120,df=5)/sqrt(5/3),nrow=N)
out.m.120 <- apply(z,1,mean)
out.t20.120 <- apply(z,1,mean,trim=0.20)
out.med.120 <- apply(z,1,median)
raw.out[,25] <- out.m.120
raw.out[,26] <- out.t20.120
raw.out[,27] <- out.med.120
t5.15 <- data.frame(mean=15*var(out.m.15),trim20=15*var(out.t20.15),median=15*var(out.med.15))
t5.30 <- data.frame(mean=30*var(out.m.30),trim20=30*var(out.t20.30),median=30*var(out.med.30))
t5.120 <- data.frame(mean=120*var(out.m.120),trim20=120*var(out.t20.120),median=120*var(out.med.120))