Is there a relationship between pclass and survived?

TITANIC3 <- TITANIC3 %>% 
  mutate(survived = factor(survived, labels = c("Dead", "Alive")))
ggplot(data = TITANIC3, aes(x = survived, fill = pclass)) +
  geom_bar(position = "fill") +
  theme_bw() + 
  labs(y = "percent")

1 Chi-Square

Note: \(E_{ij} = \frac{R_i \times C_j}{n_T}\)

\[\chi^2_{obs} = \sum\frac{(O- E)^2}{E}\]

T1 <- TITANIC3 %>% 
  select(pclass, survived) %>% 
  table() 
T1
      survived
pclass Dead Alive
   1st  123   200
   2nd  158   119
   3rd  528   181
addmargins(T1)
      survived
pclass Dead Alive  Sum
   1st  123   200  323
   2nd  158   119  277
   3rd  528   181  709
   Sum  809   500 1309
E11 <- (323/1309)*(809/1309)*1309
E12 <- (323/1309)*(500/1309)*1309
E21 <- (277/1309)*(809/1309)*1309
E22 <- (277/1309)*(500/1309)*1309
E31 <- (709/1309)*(809/1309)*1309
E32 <- (709/1309)*(500/1309)*1309
EIJ <- matrix(c(E11, E12, E21, E22, E31, E32), byrow = TRUE, nrow = 3)
EIJ
         [,1]     [,2]
[1,] 199.6234 123.3766
[2,] 171.1940 105.8060
[3,] 438.1826 270.8174
X2 <- (123 - E11)^2/E11 + (200 - E12)^2/E12 + (158 - E21)^2/E21 + (119 - E22)^2/E22 +
      (528 - E31)^2/E31 + (181 - E32)^2/E32 
X2
[1] 127.8592
pvalue <- pchisq(X2, 2, lower = FALSE)
pvalue
[1] 1.720826e-28
chisq.test(T1, correct = FALSE)

    Pearson's Chi-squared test

data:  T1
X-squared = 127.86, df = 2, p-value < 2.2e-16
# Easier fashion to get expected
chisq.test(T1)$exp
      survived
pclass     Dead    Alive
   1st 199.6234 123.3766
   2nd 171.1940 105.8060
   3rd 438.1826 270.8174

1.1 Same thing with permutation

1.1.1 Using a for() loop

B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
  PT <- xtabs(~pclass + sample(survived), data = TITANIC3)
  X2[i] <- chisq.test(PT, correct = FALSE)$stat
}
chi_obs_stat <- chisq.test(xtabs(~pclass + survived, data = TITANIC3))$stat
chi_obs_stat
X-squared 
 127.8592 
# OR
chi_obs_stat <- chisq.test(table(TITANIC3$pclass, TITANIC3$survived))$stat
chi_obs_stat
X-squared 
 127.8592 
#
pvalueSIM <- (sum(X2 >= chi_obs_stat) + 1)/(B + 1)
pvalueSIM
[1] 1e-04

What does the permutation distribution under the null hypothesis look like?

ggplot(data = data.frame(x = X2), aes(x = x)) +
  geom_density(fill = "pink") + 
  theme_bw() 

1.2 Same thing with infer

null <- TITANIC3 %>% 
  specify(pclass ~ survived) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 10^3 - 1, type = "permute") %>% 
  calculate(stat = "Chisq")

1.2.1 Visualize the null distribution

ggplot(data = null, aes(x = stat)) +
  geom_density(fill = "skyblue") +
  theme_bw()

visualize(null, method = "both", dens_color = "blue") + 
  theme_bw()

null %>% 
  summarize(pvalue = (sum(stat >= chi_obs_stat) + 1)/(10^3)) 
# A tibble: 1 × 1
  pvalue
   <dbl>
1  0.001

Note problem below

get_p_value(null, obs_stat = chi_obs_stat, direction = "right")
# A tibble: 1 × 1
  p_value
    <dbl>
1       0

2 Consider testing \(H_0:p_1 - p_2 = 0\)

TITANIC3 %>% 
  select(sex, survived) %>% 
  table() %>% 
  addmargins()
        survived
sex      Dead Alive  Sum
  female  127   339  466
  male    682   161  843
  Sum     809   500 1309
TITANIC3 %>% 
  select(sex, survived) %>% 
  table() %>% 
  prop.table(1)
        survived
sex           Dead     Alive
  female 0.2725322 0.7274678
  male   0.8090154 0.1909846

Consider testing: \(H_0:p_{\text{Male Alive}} - p_{\text{Female Alive}} = 0\) versus \(H_A:p_{\text{Male Alive}} - p_{\text{Female Alive}} < 0\)

prop.test(x = c(161, 339), n = c(843, 466), alternative = "less", correct = FALSE)

    2-sample test for equality of proportions without continuity correction

data:  c(161, 339) out of c(843, 466)
X-squared = 365.89, df = 1, p-value < 2.2e-16
alternative hypothesis: less
95 percent confidence interval:
 -1.0000000 -0.4959006
sample estimates:
   prop 1    prop 2 
0.1909846 0.7274678 
obs_stat <- prop.test(x = c(161, 339), n = c(843, 466), alternative = "less", correct = FALSE)$stat^.5
obs_stat
X-squared 
 19.12817 
# Same as
phatmale <- 161/843
phatfemale <- 339/466
phatp <- (161 + 339)/(843 + 466)
zobs <- (phatmale - phatfemale) / sqrt( phatp*(1 - phatp)/843 + phatp*(1 - phatp)/466)
zobs
[1] -19.12817
pvalue <- pnorm(zobs)
pvalue
[1] 7.357266e-82

Note: This can also be tested using the \(\chi^2\)-test of homogeneity.

\(H_0: p_{\text{Male Alive}} = p_{\text{Female Alive}} = p_{\text{Male Dead}} = p_{\text{Female Dead}}\) versus at least some \(p_{ij} \neq p_{i+1, j}\) for some \((i,j)\).

T1 <- TITANIC3 %>% 
  select(sex, survived) %>% 
  table()
chisq.test(T1, correct = FALSE)

    Pearson's Chi-squared test

data:  T1
X-squared = 365.89, df = 1, p-value < 2.2e-16
chisq.test(T1, correct = FALSE)$stat^.5
X-squared 
 19.12817 
null <- TITANIC3 %>% 
  specify(survived ~ sex, success = "Alive") %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 10^3 - 1, type = "permute") %>% 
  calculate(stat = "Chisq", order = c("male", "female"))
ggplot(data = null, aes(x  = stat)) + 
  geom_density(fill = "pink") +
  theme_bw()

null %>%
  summarize( (sum(stat >= obs_stat) + 1)/10^3)
# A tibble: 1 × 1
  `(sum(stat >= obs_stat) + 1)/10^3`
                               <dbl>
1                              0.001
B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
  T3 <- xtabs(~sex + sample(survived), data = TITANIC3)
  X2[i] <- chisq.test(T3, correct = FALSE)$stat 
}
pvalue <- (sum(X2 >= obs_stat) + 1)/10^4
pvalue
[1] 1e-04

3 Chi-Square test of Homogeneity

Data will often come summarized in contingency tables.

DP <- c(67, 76, 57, 48, 73, 79)
MDP <- matrix(data = DP, nrow = 2, byrow = TRUE)
dimnames(MDP) <- list(Pop = c("Drug", "Placebo"), Status = c("Improve", "No Change", "Worse"))
TDP <- as.table(MDP)
TDP
         Status
Pop       Improve No Change Worse
  Drug         67        76    57
  Placebo      48        73    79

The problem then becomes one of putting the data back into a tidy format.

NT <- TDP %>% 
  tibble::as_tibble() %>% 
  uncount(n)
NT
# A tibble: 400 × 2
   Pop   Status 
   <chr> <chr>  
 1 Drug  Improve
 2 Drug  Improve
 3 Drug  Improve
 4 Drug  Improve
 5 Drug  Improve
 6 Drug  Improve
 7 Drug  Improve
 8 Drug  Improve
 9 Drug  Improve
10 Drug  Improve
# ℹ 390 more rows

3.1 Graph

ggplot(data = NT, aes(x = Pop, fill = Status)) +
  geom_bar(position = "fill") +
  theme_bw()

T1 <- NT %>% 
  table()
T1
         Status
Pop       Improve No Change Worse
  Drug         67        76    57
  Placebo      48        73    79
chisq.test(T1)

    Pearson's Chi-squared test

data:  T1
X-squared = 6.7584, df = 2, p-value = 0.03408
(chi_obs <- chisq.test(T1)$stat)
X-squared 
 6.758357 
E <- chisq.test(T1)$exp
E
         Status
Pop       Improve No Change Worse
  Drug       57.5      74.5    68
  Placebo    57.5      74.5    68
(X2 <- sum((T1 - E)^2/E))
[1] 6.758357
(pvalue <- pchisq(X2, 2, lower = FALSE))
[1] 0.03407544

3.2 Permutation distribution

B <- 10^3 - 1
X2 <- numeric(B)
for(i in 1:B){
  TN <- xtabs(~Pop + sample(Status), data = NT)
  X2[i] <- chisq.test(TN)$stat
}
null_hom <- data.frame(stat = X2)
ggplot(data = null_hom, aes(x = stat)) +
  geom_density(fill = "purple", alpha = 0.3) +
  theme_bw() + 
  stat_function(fun = dchisq, args = list(df = 2), color = "blue") 

(pval <- (sum(null_hom$stat >= chi_obs) + 1)/(B + 1))
[1] 0.03

3.3 Same approach with infer

null_hom_infer <- NT %>% 
  specify(Pop ~ Status) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 10^3 - 1, type = "permute") %>% 
  calculate(stat = "Chisq")
visualize(null_hom_infer, method = "both", dens_color = "blue") +
  theme_bw()

(pval2 <- (sum(null_hom_infer$stat >= chi_obs) + 1)/(B + 1))
[1] 0.035

3.4 GOF test

Given another table (from summarized information):

BD <- c(23, 20, 18, 23, 20, 19, 18, 21, 19, 22, 24, 29)
Sign = c("Aries", "Taurus", "Gemini", "Cancer", "Leo",
         "Virgo", "Libra", "Scorpio", "Sagitarius",
         "Capricorn", "Aquarius", "Pisces")
VEC <- rep(Sign, times = BD)
(T2 <- table(VEC))
VEC
  Aquarius      Aries     Cancer  Capricorn     Gemini        Leo      Libra 
        24         23         23         22         18         20         18 
    Pisces Sagitarius    Scorpio     Taurus      Virgo 
        29         19         21         20         19 
BNT <- T2 %>% 
  tibble::as_tibble() %>% 
  uncount(n)
BNT
# A tibble: 256 × 1
   VEC     
   <chr>   
 1 Aquarius
 2 Aquarius
 3 Aquarius
 4 Aquarius
 5 Aquarius
 6 Aquarius
 7 Aquarius
 8 Aquarius
 9 Aquarius
10 Aquarius
# ℹ 246 more rows

3.5 Graph

ggplot(data = BNT, aes(x = VEC)) +
  geom_bar(fill = rainbow(12)) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  labs(x = "Astrological Sign")

chisq.test(T2, p = rep(1/12, 12))

    Chi-squared test for given probabilities

data:  T2
X-squared = 5.0938, df = 11, p-value = 0.9265
(pv <- chisq.test(T2, p = rep(1/12, 12))$p.value)
[1] 0.9265414

3.5.1 Conclusion

There is little evidence (p-value = 0.9265414) to suggest a nonuniform distribution of zodiac signs among executives.

3.6 Chi-Square Test of Independence

TH <- c(17, 35, 8, 53, 22, 491)
MTH <- matrix(data = TH, nrow = 3, byrow = TRUE)
dimnames(MTH) <- list(Tatoo = c("Parlor", "Elsewhere", "None"), HepC = c("Yes", "No"))
MTH <- as.table(MTH)
MTH
           HepC
Tatoo       Yes  No
  Parlor     17  35
  Elsewhere   8  53
  None       22 491
NHT <- MTH %>% 
  tibble::as_tibble() %>% 
  uncount(n)
NHT
# A tibble: 626 × 2
   Tatoo  HepC 
   <chr>  <chr>
 1 Parlor Yes  
 2 Parlor Yes  
 3 Parlor Yes  
 4 Parlor Yes  
 5 Parlor Yes  
 6 Parlor Yes  
 7 Parlor Yes  
 8 Parlor Yes  
 9 Parlor Yes  
10 Parlor Yes  
# ℹ 616 more rows
ggplot(data = NHT, aes( x = HepC, fill = Tatoo)) +
  geom_bar(position = "fill") +
  theme_bw() +
  scale_fill_manual(values = c("pink", "green", "red"))

(T1 <- xtabs(~HepC + Tatoo,data = NHT))
     Tatoo
HepC  Elsewhere None Parlor
  No         53  491     35
  Yes         8   22     17
chisq.test(T1)

    Pearson's Chi-squared test

data:  T1
X-squared = 57.912, df = 2, p-value = 2.658e-13
(obs_stat <- chisq.test(T1)$stat)
X-squared 
 57.91217 
B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
  TT <- xtabs(~HepC + sample(Tatoo), data = NHT)
  X2[i] <- chisq.test(TT)$stat
}
(pval <- (sum(X2 >= obs_stat) + 1)/(10^4 -1 + 1))
[1] 1e-04