1 Bootstrap Distributions

  1. Construct a 94% bootstrap percentile confidence interval for the mean number of obstructive contacts (oc) in the EPIDURAL data set from the PASWR2 package. Use a for loop and the infer pipeline to construct your confidence intervals.
library(PASWR2)
library(tidyverse)
library(infer)
summary(EPIDURAL)
   doctor         kg               cm                ease   
 Dr. A:23   Min.   : 48.00   Min.   :152.0   Difficult :20  
 Dr. B:21   1st Qu.: 72.00   1st Qu.:160.0   Easy      :57  
 Dr. C:21   Median : 84.00   Median :165.0   Impossible: 8  
 Dr. D:20   Mean   : 86.08   Mean   :165.4                  
            3rd Qu.: 94.00   3rd Qu.:170.0                  
            Max.   :168.00   Max.   :185.0                  
                                                            
               treatment        oc                            complications
 Hamstring Stretch  :35   Min.   : 0.000   Failure - person got dizzy: 1   
 Traditional Sitting:50   1st Qu.: 0.000   Failure - too many OCs    : 2   
                          Median : 1.000   None                      :76   
                          Mean   : 1.286   Paresthesia               : 5   
                          3rd Qu.: 2.000   Wet Tap                   : 1   
                          Max.   :10.000                                   
                          NA's   :1                                        
EPIDURALC <- na.omit(EPIDURAL) # removing the NA value
summary(EPIDURALC)
   doctor         kg               cm                ease   
 Dr. A:23   Min.   : 48.00   Min.   :152.0   Difficult :20  
 Dr. B:21   1st Qu.: 72.00   1st Qu.:160.0   Easy      :56  
 Dr. C:20   Median : 84.50   Median :165.0   Impossible: 8  
 Dr. D:20   Mean   : 86.21   Mean   :165.3                  
            3rd Qu.: 94.00   3rd Qu.:170.0                  
            Max.   :168.00   Max.   :185.0                  
               treatment        oc                            complications
 Hamstring Stretch  :35   Min.   : 0.000   Failure - person got dizzy: 0   
 Traditional Sitting:49   1st Qu.: 0.000   Failure - too many OCs    : 2   
                          Median : 1.000   None                      :76   
                          Mean   : 1.286   Paresthesia               : 5   
                          3rd Qu.: 2.000   Wet Tap                   : 1   
                          Max.   :10.000                                   
B <- 10^4
bsm <- numeric(B)
for(i in 1:B){
  bss <- sample(EPIDURALC$oc, size = sum(!is.na(EPIDURALC$oc)), replace = TRUE)
  bsm[i] <- mean(bss)
}
(quantile(bsm, probs = c(0.03, 0.97)) -> BSPCI)
       3%       97% 
0.9047619 1.7261905 
# Infer pipeline
EPIDURALC %>% 
  specify(response = oc) %>% 
  generate(reps = B, type = "bootstrap") %>% 
  calculate(stat = "mean") -> bsmip
head(bsmip)
Response: oc (numeric)
# A tibble: 6 × 2
  replicate  stat
      <int> <dbl>
1         1  1.18
2         2  1.10
3         3  1.21
4         4  1.24
5         5  1.07
6         6  1.32
(get_confidence_interval(bsmip, level = 0.94) -> CI1)
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1    0.893     1.73
# The following computes the same thing
(quantile(bsmip$stat, probs = c(0.03, 0.97)) -> CI2)
       3%       97% 
0.8928571 1.7261905 

The 94% bootstrap percentile confidence interval for the mean number of obstructive contacts is: \(\text{CI}_{0.94}(\mu_{oc}) = [0.8928571, 1.7261905]\).


1.1 Visualize the Bootstrap Distribution

  1. Create a histogram of the bootstrap distribution of the mean number of obstructive contacts.
# Using base R with `bsm`
hist(bsm, xlab = substitute(paste(bar(X),"*")), main = "", col = "gray")

# Using ggplot2 with `bsm`
ggplot(data = data.frame(bsm = bsm), aes(x = bsm)) + 
  geom_histogram(binwidth = 0.1, fill = "gray", color = "black") + 
  theme_bw() + 
  labs(x = substitute(paste(bar(X),"*")))

# Using wrapper functions with bsmip (an infer object)
visualize(bsmip)

visualize(bsmip, fill = "purple") + 
  labs(x = substitute(paste(bar(X),"*")), 
       title = "Simulation-Based Bootstrap Distribution \n of the Mean Number of Obstructive Contacts") + 
  shade_confidence_interval(CI1, color = "red", fill = "lightblue") + 
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) 

  1. Create a 98% bootstrap percentile confidence interval for the mean fat of males in the BODYFAT data set from the PASWR2 package.
#Creating a vector with the Male fat values
BODYFAT %>% 
  filter(sex == "M") %>% 
  select(fat) %>% 
  pull() -> fat
fat
[1]  9.5  7.8 17.8 27.4
# Another approach using base R
subset(BODYFAT, subset = sex == "M", select = fat, drop = TRUE) -> fat2
fat2
[1]  9.5  7.8 17.8 27.4
# One more approach
BODYFAT$fat[BODYFAT$sex=="M"] -> fat3
fat3
[1]  9.5  7.8 17.8 27.4
B <- 10^4
bsmf <- numeric(B)
for(i in 1:B){
  bss <- sample(fat, size = sum(!is.na(fat)), replace = TRUE)
  bsmf[i] <- mean(bss)
}
(quantile(bsmf, probs = c(0.01, 0.99)) -> BSPCIF)
    1%    99% 
 8.225 25.000 
# Using infer
BODYFAT %>% 
  filter(sex == "M") %>% 
  specify(response = fat) %>% 
  generate(reps = B, type = "bootstrap") %>% 
  calculate(stat = "mean") -> IFM
(get_confidence_interval(IFM, level = 0.98) -> CIF1)
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     8.22       25

1.2 Visualize the Bootstrap Distribution

visualize(IFM) +
  shade_confidence_interval(endpoints = CIF1)


2 Bootsrap Distribution for the Difference of Two Means

  1. Construct a 90% bootstrap percentile confidence interval for the difference between the mean obstructive contacts using the hamstring stretch position and the mean obstructive contacts using the traditional sitting position. (Use the EPIDURAL data set from PASWR2)
EPIDURALC %>% 
  filter(treatment == "Hamstring Stretch") %>% 
  select(oc)  %>% 
  pull() -> ochs # OCs for hamstring stretch
EPIDURALC %>% 
  filter(treatment == "Traditional Sitting") %>% 
  select(oc) %>% 
  pull() -> octs # OCs for traditional sitting
####
B <- 10^4
dm <- numeric(B)
for(i in 1:B){

  bsshs <- sample(ochs, size = sum(!is.na(ochs)), replace = TRUE)
  bssts <- sample(octs, size = sum(!is.na(octs)), replace = TRUE)
  dm[i] <- mean(bsshs) - mean(bssts)
}
(quantile(dm, probs = c(0.05, 0.95)) -> CIDM)
        5%        95% 
-0.3510204  1.2285714 

The 90% bootstrap percentile confidence interval for the difference in mean obstructive contacts for the hamstring stretch treatment minus the mean obstructive contacts for the traditional sitting position is: \(\text{CI}_{0.90}(\mu_{hs} - \mu _{ts})= [-0.3510204, 1.2285714]\).

EPIDURALC %>% 
  specify(oc ~ treatment) %>% 
  generate(reps = B, type = "bootstrap") %>% 
  calculate(stat = "diff in means", order = c("Hamstring Stretch", "Traditional Sitting")) -> BSDM
(get_confidence_interval(BSDM, level = 0.90) -> CIDM)
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1   -0.355     1.22

2.1 Visualize the Bootstrap Distribution

visualize(BSDM) +
  shade_confidence_interval(endpoints = CIDM)


3 Bootstrap Percentile Confidence Interval for the Proportion

  1. Construct a 95% bootstrap percentile confidence interval for the true proportion of women who have no complications while receiving an epidural.
results <- EPIDURAL$complications=="None"
(phat <- mean(EPIDURAL$complications=="None"))
[1] 0.8941176
B <- 10^4
phatbs <- numeric(B)
for(i in 1:B){
  bss <- sample(results, size = sum(!is.na(results)), replace = TRUE)
  phatbs[i] <- mean(bss)
}
(quantile(phatbs, probs = c(0.025, 0.975)) -> CIP)
     2.5%     97.5% 
0.8235294 0.9529412 
EPIDURAL %>% 
  mutate(no_complications = (complications == "None")) %>% 
  specify(response = no_complications, success = "TRUE") %>% 
  generate(reps = 10^4, type = "bootstrap") %>% 
  calculate(stat = "prop") -> PS
get_confidence_interval(PS, level = 0.95)
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1    0.824    0.953

4 Large Sample CI for \(p\)

(n <- sum(!is.na(results)))
[1] 85
phat*n
[1] 76
(1 - phat)*n
[1] 9
CI <- phat +c(-1, 1)*qnorm(.975)*sqrt(phat*(1-phat)/n)
CI
[1] 0.8287071 0.9595282
# OR
library(binom)
binom.confint(x = 76, n = 85, conf.level = 0.95, methods = "all")
          method  x  n      mean     lower     upper
1  agresti-coull 76 85 0.8941176 0.8087988 0.9453536
2     asymptotic 76 85 0.8941176 0.8287071 0.9595282
3          bayes 76 85 0.8895349 0.8226871 0.9511168
4        cloglog 76 85 0.8941176 0.8064044 0.9434493
5          exact 76 85 0.8941176 0.8084961 0.9504273
6          logit 76 85 0.8941176 0.8088548 0.9439819
7         probit 76 85 0.8941176 0.8135824 0.9458938
8        profile 76 85 0.8941176 0.8172229 0.9476443
9            lrt 76 85 0.8941176 0.8172260 0.9476675
10     prop.test 76 85 0.8941176 0.8038144 0.9474398
11        wilson 76 85 0.8941176 0.8108648 0.9432876

In Class Problem: Use the data BODYFAT from PASWR2 to construct a 95% percentile bootstrap confidence interval for \(\mu_{female\_fat} - \mu_{male\_fat}\).