1 Various pipelines

To read about the mtcars data set, type ?mtcars at the R prompt.

str(mtcars)
'data.frame':   32 obs. of  11 variables:
 $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
 $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
 $ disp: num  160 160 108 258 360 ...
 $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
 $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
 $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
 $ qsec: num  16.5 17 18.6 19.4 17 ...
 $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
 $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
 $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
 $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
DT::datatable(mtcars)

We want to test \(H_0: \mu_{\text{Manual}} - \mu_{\text{Automatic}} = 0\) versus \(H_A: \mu_{\text{Manual}} - \mu_{\text{Automatic}} > 0\)

To do so, first change the variable am to a factor and reorder its levels so that Manual comes first. The variable cyl is also converted to a factor for later use.

mtcars <- mtcars %>% 
  mutate(am = factor(am, levels = c("1","0"), labels = c("Manual", "Automatic")),
         cyl = factor(cyl, levels = c("4", "6", "8")))
mtcars %>% 
  group_by(am) %>% 
  summarize(Mean = mean(mpg), SD = sd(mpg), n = n())
# A tibble: 2 × 4
  am         Mean    SD     n
  <fct>     <dbl> <dbl> <int>
1 Manual     24.4  6.17    13
2 Automatic  17.1  3.83    19
mtcars %>% 
  group_by(cyl) %>% 
  summarize(Mean = mean(mpg), SD = sd(mpg), n = n())
# A tibble: 3 × 4
  cyl    Mean    SD     n
  <fct> <dbl> <dbl> <int>
1 4      26.7  4.51    11
2 6      19.7  1.45     7
3 8      15.1  2.56    14
p1 <- ggplot(data = mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 3, fill = "lightblue", color = "black") +
  facet_wrap(~am) + 
  theme_bw()
#
p2 <- ggplot(data = mtcars, aes(x = mpg)) +
  geom_density(fill = "lightblue", color = "black") +
  facet_wrap(~am) + 
  theme_bw()
p3 <- ggplot(data = mtcars, aes(sample = mpg, color = am)) + 
  geom_qq() + 
  facet_wrap(~am) + 
  theme_bw()
#
p4 <- ggplot(data = mtcars, aes(sample = mpg, color = am)) + 
  geom_qq() + 
  theme_bw() + 
  scale_color_manual(values = c("blue", "red"))
gridExtra::grid.arrange(p1, p3, p2, p4, ncol = 2)

# or
library(patchwork)
p1 + p3 + p2 + p4 + plot_layout(ncol = 2)

SIMS <- 1000  # number of simulated resamples used throughout
library(infer)
mtcars %>%
    specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am
    hypothesize(null = "independence") %>%
    generate(reps = SIMS, type = "permute") %>%
    calculate(stat = "t", order = c("Manual", "Automatic")) %>%  # 0 = automatic, 1 = manual
    visualize(method = "both") #"simulation" is the default method

mtcars %>%
    specify(response = mpg, explanatory = am) %>% # alt: mpg ~ am
    hypothesize(null = "independence") %>%
    generate(reps = SIMS, type = "permute") %>%
    calculate(stat = "t", order = c("Manual", "Automatic")) %>%  # 0 = automatic, 1 = manual
    visualize(method = "both") #"simulation" is the default method

TS <- mtcars %>%
    specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am
    hypothesize(null = "independence") %>%
    generate(reps = SIMS, type = "permute") %>%
    calculate(stat = "t", order = c("Manual", "Automatic")) # 0 = automatic, 1 = manual
#####
obs_stat <- t.test(mpg ~ am, data = mtcars)$stat
obs_stat
       t 
3.767123 
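
The statistic above comes from the Welch two-sample t-test; for reference, the full test with the one-sided alternative can be run directly (a minimal sketch, output omitted):
# Welch t-test of H_0: mu_Manual - mu_Automatic = 0 versus
# H_A: mu_Manual - mu_Automatic > 0 (am was releveled so Manual is its first level)
t.test(mpg ~ am, data = mtcars, alternative = "greater")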
####
ggplot(data = TS, aes(x = stat, y = after_stat(density))) +
  geom_histogram(fill = "lightblue", color = "black", binwidth = 0.3) + 
  theme_bw() + 
  geom_vline(xintercept = obs_stat, linetype = "dashed") 

####
get_p_value(TS, obs_stat, direction = "greater")
# A tibble: 1 × 1
  p_value
    <dbl>
1       0
#### A simulated p-value of exactly 0 is an artifact of the finite number of
#### permutations; adding 1 to the numerator and denominator avoids reporting 0
pvalue <- (sum(TS$stat >= obs_stat) + 1)/(SIMS + 1)
pvalue
[1] 0.000999001

1.1 Using a for() loop

sims <- SIMS
ts <- numeric(sims)
for(i in 1:sims){
  ts[i] <- t.test(mpg ~ sample(am), data = mtcars)$stat
}
pvalue <- (sum(ts >= obs_stat) + 1)/(sims + 1)
pvalue
[1] 0.000999001
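
As with the infer version, the permutation distribution from the loop can be plotted with the observed statistic marked; a minimal sketch mirroring the earlier histogram:
ggplot(data = data.frame(stat = ts), aes(x = stat)) +
  geom_histogram(fill = "lightblue", color = "black", binwidth = 0.3) +
  geom_vline(xintercept = obs_stat, linetype = "dashed") +
  theme_bw()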

1.2 Using Difference in Means

DM <- mtcars %>%
    specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am
    hypothesize(null = "independence") %>%
    generate(reps = SIMS, type = "permute") %>%
    calculate(stat = "diff in means", order = c("Manual", "Automatic")) # 0 = automatic, 1 = manual
#####
obs_MDIFF <- mtcars %>% 
  group_by(am) %>% 
  summarize(MD = mean(mpg)) %>% 
  mutate(MeanDiff = -diff(MD)) %>% 
  pull()
obs_stat <- obs_MDIFF[1]
obs_stat
[1] 7.244939
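
The observed difference can also be obtained with the infer verbs themselves; skipping hypothesize() and generate() returns the statistic computed on the original data (a sketch, equivalent to obs_stat above):
mtcars %>%
  specify(mpg ~ am) %>%
  calculate(stat = "diff in means", order = c("Manual", "Automatic"))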
get_p_value(DM, obs_stat = obs_stat, direction = "greater")
# A tibble: 1 × 1
  p_value
    <dbl>
1       0
pvalue <- mean(DM$stat >= obs_stat)
pvalue
[1] 0
# Better yet
pvalue1 <- (sum(DM$stat >= obs_stat) + 1)/(SIMS + 1)
pvalue1
[1] 0.000999001

1.2.1 With a for() loop

sims <- SIMS - 1  # so that sims + 1 = SIMS in the p-value denominator
dm <- numeric(sims)
for(i in 1:sims){
  dm[i] <- -diff(tapply(mtcars$mpg, sample(mtcars$am), mean))
}
ggplot(data = data.frame(stat = dm), aes(x = stat)) + 
  geom_density(fill = "purple") + 
  theme_bw() + 
  geom_vline(xintercept = obs_stat, linetype = "dashed")

pvalue <- (sum(dm >= obs_stat) + 1)/(sims + 1)
pvalue
[1] 0.001

1.3 Confidence Intervals

Construct a 92% CI for the true mean quarter mile time.

BM <- specify(mtcars, response = qsec) %>% 
  generate(reps = SIMS, type = "bootstrap") %>% 
  calculate(stat = "mean")
visualize(BM)

# Bootstrap Percentile CI
PCI <- BM %>%
  summarize(l = quantile(stat, 0.04),
            u = quantile(stat, 0.96)) 
PCI
# A tibble: 1 × 2
      l     u
  <dbl> <dbl>
1  17.3  18.4
### OR
(quantile(BM$stat, probs = c(0.04, 0.96)) -> PCI1)
      4%      96% 
17.32029 18.43804 
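
infer's get_confidence_interval() helper should return the same percentile interval; a minimal sketch:
get_confidence_interval(BM, level = 0.92, type = "percentile")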
###
# Visualize
BM %>% visualize() + 
  shade_confidence_interval(endpoints = PCI, color = "red", fill = "pink") + 
  theme_bw()

####
ggplot(data = BM, aes(x = stat, fill = stat > PCI$l & stat < PCI$u) ) +
  geom_histogram(binwidth = 0.15) + 
  scale_fill_manual(values = c("blue", "green")) +
  theme_bw()

# Bootstrap SE CI
(n <- sum(!is.na(mtcars$qsec)))
[1] 32
(multiplier <- qt(0.96, n - 1))
[1] 1.810002
(SEBM <- sd(BM$stat))
[1] 0.3174961
(CI <- mean(mtcars$qsec) + c(-1, 1)*multiplier*SEBM)
[1] 17.27408 18.42342
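
A similar interval is available from infer with type = "se"; note that it is based on a normal multiplier, so its endpoints may differ slightly from the t-based interval above. A sketch, where obs_mean_qsec is simply a name for the observed mean computed with the infer verbs:
obs_mean_qsec <- specify(mtcars, response = qsec) %>%
  calculate(stat = "mean")
get_confidence_interval(BM, level = 0.92, type = "se",
                        point_estimate = obs_mean_qsec)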
#
# Visualize
BM %>% visualize() + 
  shade_confidence_interval(endpoints = CI, color = "red", fill = "pink") + 
  theme_bw()

1.3.1 Using a for() loop for CI

sims <- SIMS
ME <- numeric(sims)
for(i in 1:sims){
  bss <- sample(mtcars$qsec, size = sum(!is.na(mtcars$qsec)), replace = TRUE)
  ME[i] <- mean(bss)
}
# Percentile CI
(PCI <- quantile(ME, probs = c(0.04, 0.96)))
      4%      96% 
17.30015 18.36313 
# SE CI
(multiplier <- qt(0.96, 32 - 1))
[1] 1.810002
(SCI <- mean(mtcars$qsec) + c(-1, 1)*multiplier*sd(ME))
[1] 17.28346 18.41404

1.4 ANOVA

mtcars %>% 
  group_by(cyl) %>% 
  summarize(Mean = mean(mpg), SD = sd(mpg), n = n())
# A tibble: 3 × 4
  cyl    Mean    SD     n
  <fct> <dbl> <dbl> <int>
1 4      26.7  4.51    11
2 6      19.7  1.45     7
3 8      15.1  2.56    14
(GM <- mean(mtcars$mpg))
[1] 20.09062
(MT1 <- mean(mtcars$mpg[mtcars$cyl == "4"]))
[1] 26.66364
(MT2 <- mean(mtcars$mpg[mtcars$cyl == "6"]))
[1] 19.74286
(MT3 <- mean(mtcars$mpg[mtcars$cyl == "8"]))
[1] 15.1
(SSTreat <- 11*(MT1 - GM)^2 + 7*(MT2 - GM)^2 + 14*(MT3 - GM)^2)
[1] 824.7846
(SSTotal <- sum((mtcars$mpg - GM)^2))
[1] 1126.047
(SSError <- SSTotal - SSTreat)
[1] 301.2626
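
Dividing each sum of squares by its degrees of freedom (2 for treatments and 32 - 3 = 29 for error) gives the mean squares, and their ratio is the F statistic reported by aov() below (MSTreat, MSError, and Fobs are just names used here):
(MSTreat <- SSTreat/2)
(MSError <- SSError/29)
(Fobs <- MSTreat/MSError)
pf(Fobs, df1 = 2, df2 = 29, lower.tail = FALSE)  # theory-based p-value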
# Checks
ggplot(data = mtcars, aes(sample = mpg)) +
  geom_qq() +
  facet_wrap(vars(cyl)) + 
  theme_bw()

# Questionable whether the assumption of constant variance across treatments
# is satisfied
ggplot(data = mtcars, aes(x = cyl, y = mpg)) +
  geom_boxplot() + 
  theme_bw()
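
One way to follow up on the constant-variance question is a formal check; Bartlett's test from base R is one option (it is sensitive to non-normality, so treat it as a rough guide):
bartlett.test(mpg ~ cyl, data = mtcars)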

# If assumptions are satisfied use what follows
mod.aov <- aov(mpg ~ cyl, data = mtcars)
summary(mod.aov)
            Df Sum Sq Mean Sq F value   Pr(>F)    
cyl          2  824.8   412.4    39.7 4.98e-09 ***
Residuals   29  301.3    10.4                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
obs_stat <- summary(mod.aov)[[1]][1, 4]
obs_stat
[1] 39.69752
# OR
obs_stat <- anova(lm(mpg ~ cyl, data = mtcars))[1, 4]
obs_stat
[1] 39.69752

1.5 Using a resampling approach to test equality of multiple means

\(H_0:\mu_4 = \mu_6 = \mu_8\) versus \(H_A: \mu_i \neq \mu_j \text{ for some } (i, j)\)

1.5.1 infer approach

FS <- mtcars %>% 
  specify(mpg ~ cyl) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = SIMS, type = "permute") %>% 
  calculate(stat = "F")
visualize(FS, method = "both")

get_p_value(FS, obs_stat = obs_stat, direction = "greater")
# A tibble: 1 × 1
  p_value
    <dbl>
1       0
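
As before, the reported p-value of 0 is an artifact of the finite number of permutations; the same correction used earlier gives a small but nonzero estimate:
(sum(FS$stat >= obs_stat) + 1)/(SIMS + 1)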

1.5.2 for() loop

sims <- SIMS - 1  # so that sims + 1 = SIMS in the p-value denominator
FFS <- numeric(sims)
for(i in 1:sims){
  FFS[i] <- summary(aov(mpg ~ sample(cyl), data = mtcars))[[1]][1, 4]
}
pvalue <- (sum(FFS >= obs_stat) + 1)/(sims + 1)
pvalue
[1] 0.001
ggplot(data = data.frame(stat = FFS), aes(x = stat)) +
  geom_density(color = "red", fill = "tomato") + 
  theme_bw() + 
  stat_function(fun = df, args = list(df1 = 2, df2 = 29), color = "blue")

1.6 Testing a single parameter with a bootstrap approach

Consider testing \(H_0:\mu_{qsec} = 17\) versus \(H_A:\mu_{qsec} \neq 17\)

(xbar <- mean(mtcars$qsec))
[1] 17.84875
(null <- 17 - xbar)
[1] -0.84875
sims <- SIMS
bm <- numeric(sims)
for(i in 1:sims){
  bss <- sample(mtcars$qsec, size = 32, replace = TRUE) + null
  bm[i] <- mean(bss)
}
# Base histogram
hist(bm, breaks = "Scott", col = "lightblue")

# ggplot2 histogram
ggplot(data = data.frame(x = bm), aes(x = x)) +
  geom_histogram(color = "black", fill = "lightblue", binwidth = 0.1) + theme_bw()

pvalue <- (sum(bm >= xbar)*2 + 1)/(sims + 1)  # doubles the upper-tail count for a two-sided test
pvalue
[1] 0.01298701
# Or
pvalue2 <- (sum(bm >= 17 - null) + sum(bm <= 17 + null) + 1)/(sims + 1)  # 17 - null = xbar; 17 + null is xbar reflected about 17
pvalue2
[1] 0.00999001
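
The same shift-and-resample idea can be expressed with the infer pipeline, where hypothesize(null = "point", mu = 17) centers the bootstrap distribution at the null value; a sketch (BT is just a name used here):
BT <- mtcars %>%
  specify(response = qsec) %>%
  hypothesize(null = "point", mu = 17) %>%
  generate(reps = SIMS, type = "bootstrap") %>%
  calculate(stat = "mean")
get_p_value(BT, obs_stat = xbar, direction = "two-sided")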
#
t.test(mtcars$qsec, mu = 17, alternative = "two.sided")

    One Sample t-test

data:  mtcars$qsec
t = 2.6869, df = 31, p-value = 0.01149
alternative hypothesis: true mean is not equal to 17
95 percent confidence interval:
 17.20449 18.49301
sample estimates:
mean of x 
 17.84875 

1.7 Example 5.4 (page 120 of MSWR)

Do men take more physical risks in the presence of an attractive woman? Two psychologists in Australia conducted an experiment to explore this question (Ronay and von Hippel (2009)). Male skateboarders between the ages of 18 and 35 years were randomly assigned to perform tricks in the presence of either an attractive 18-year-old female experimenter or a male experimenter. The two experimenters, both of whom were blind to the hypotheses, videotaped the sessions. At the end of the experiment, the researchers collected saliva from the participants and measured their testosterone levels.

Construct a 95% confidence interval for \(\mu_{fp} - \mu_{mp}\) (the mean testosterone level with the attractive female experimenter present minus the mean testosterone level with the male experimenter present) using the infer pipeline and 1000 bootstrap samples.

library(resampledata)
str(Skateboard)
'data.frame':   71 obs. of  3 variables:
 $ Age         : int  18 18 18 18 19 19 18 18 18 19 ...
 $ Experimenter: Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
 $ Testosterone: num  206 197 136 170 107 ...
MT <- Skateboard %>% 
  specify(Testosterone ~ Experimenter) %>% 
  generate(reps = 1000, type = "bootstrap") %>% 
  calculate(stat = "diff in means", order = c("Female", "Male"))
###
PCI <- MT %>%
  summarize(l = quantile(stat, 0.025),
            u = quantile(stat, 0.975)) 
PCI
# A tibble: 1 × 2
      l     u
  <dbl> <dbl>
1  29.7  142.
### Or
(quantile(MT$stat, probs = c(0.025, 0.975)) -> PCIa)
     2.5%     97.5% 
 29.74435 142.24249 
### Or
(get_confidence_interval(MT, level = 0.95) -> PCIb)
# A tibble: 1 × 2
  lower_ci upper_ci
     <dbl>    <dbl>
1     29.7     142.
###
MT %>% visualize() + 
  shade_confidence_interval(endpoints = PCI, color = "red", fill = "pink") + 
  theme_bw()

  • Construct a 95% confidence interval for \(\mu_{fp}- \mu_{mp}\) using a for() loop with 1000 bootstrap samples.
testF <- subset(Skateboard, select = Testosterone, 
                subset = Experimenter == "Female", drop = TRUE)
(nF <- length(testF))
[1] 49
testM <- subset(Skateboard, select = Testosterone, 
                subset = Experimenter == "Male", drop = TRUE)
(nM <- length(testM))
[1] 22
B <- 1000
diff_means <- numeric(B)
for(i in 1:B){
  bss1 <- sample(testF, size = nF, replace = TRUE)
  bss2 <- sample(testM, size = nM, replace = TRUE)
  diff_means[i]  <- mean(bss1) - mean(bss2)
}
(PCI <- quantile(diff_means, probs = c(0.025, 0.975)))
    2.5%    97.5% 
 27.6917 137.9117 
hist(diff_means, breaks = "Scott", col = "lightblue", 
     main= "Bootstrap Distribution", xlab = expression(bar(x)[fp] - bar(x)[mp]))
abline(v = PCI, col = "purple", lty = "dashed")

  • Test the null hypothesis \(H_0: \mu_{fp}- \mu_{mp} = 0\) versus \(H_A: \mu_{fp}- \mu_{mp} > 0\) using the infer pipeline and 1000 replications.
obs_MDIFF <- Skateboard %>% 
  group_by(Experimenter) %>% 
  summarize(MD = mean(Testosterone)) %>% 
  mutate(MeanDiff = -diff(MD)) %>% 
  pull()
obs_MDIFF <- obs_MDIFF[1]
obs_MDIFF
[1] 83.0692
MD <- Skateboard %>% 
  specify(Testosterone ~ Experimenter) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("Female", "Male"))
get_p_value(MD, obs_stat = obs_MDIFF, direction = "right")
# A tibble: 1 × 1
  p_value
    <dbl>
1   0.005
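
The permutation distribution with the shaded p-value region can also be displayed with shade_p_value(); a sketch:
visualize(MD) + 
  shade_p_value(obs_stat = obs_MDIFF, direction = "right")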
  • Test the null hypothesis \(H_0: \mu_{fp}- \mu_{mp} = 0\) versus \(H_A: \mu_{fp}- \mu_{mp} > 0\) using a for() loop with 1000 replications.
(obs_mean_diff <- -diff(tapply(Skateboard$Testosterone, Skateboard$Experimenter, mean)))
   Male 
83.0692 
R <- 1000
stat <- numeric(R)
for(i in 1:R){
  stat[i] <- -diff(tapply(Skateboard$Testosterone, sample(Skateboard$Experimenter), mean))
}
(pvalue <- mean(stat >= obs_mean_diff))
[1] 0.005
  • Test the null hypothesis \(H_0: \mu_{fp}- \mu_{mp} = 0\) versus \(H_A: \mu_{fp}- \mu_{mp} > 0\) using the infer pipeline with 1000 replications and a t-test statistic.
(obs_t <- t.test(Testosterone ~ Experimenter, data = Skateboard)$stat)
       t 
2.783224 
TS <- Skateboard %>% 
  specify(Testosterone ~ Experimenter) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "t", order = c("Female", "Male"))
get_p_value(TS, obs_stat = obs_t, direction = "right")
# A tibble: 1 × 1
  p_value
    <dbl>
1   0.008
visualize(TS, method = "both") + 
  shade_p_value(obs_stat = obs_t, direction = "right", fill = "purple")

  • Test the null hypothesis \(H_0: \mu_{fp}- \mu_{mp} = 0\) versus \(H_A: \mu_{fp}- \mu_{mp} > 0\) using a for() loop with 1000 replications and a t-test statistic.
R <- 1000
t_stat <- numeric(R)
for(i in 1:R){
  t_stat[i] <- t.test(Testosterone ~ sample(Experimenter), data = Skateboard)$stat  
}
hist(t_stat)
abline(v = obs_t)

(pvalue <- mean(t_stat >= obs_t))
[1] 0.005
  • Simulate a \(t_{n-1}\) distribution where \(n = 4\) with a for() loop
R <- 10000
t_stat <- numeric(R)
for(i in 1:R){
  rs <- rnorm(4, 0, 1)
  t_stat[i] <- (mean(rs) - 0)/(sd(rs)/sqrt(4))
}
hist(t_stat, breaks = "Scott", freq = FALSE, col = "lightblue", main = "", xlim = c(-8, 8), ylim = c(0, .5))
curve(dt(x, 3), -8, 8, add = TRUE, col = "blue")  # t with n - 1 = 3 degrees of freedom
curve(dnorm(x), -8, 8, add = TRUE, col = "red", lty = "dashed")
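
The same simulation can be written more compactly with replicate(); a minimal sketch producing an equivalent vector of simulated t statistics (t_stat2 is just a name used here):
t_stat2 <- replicate(R, {
  rs <- rnorm(4, 0, 1)
  (mean(rs) - 0)/(sd(rs)/sqrt(4))
})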