Is there a relationship between pclass and survived?

TITANIC3 <- TITANIC3 %>% 
  mutate(survived = factor(survived, labels = c("Dead", "Alive")))
ggplot(data = TITANIC3, aes(x = survived, fill = pclass)) +
  geom_bar(position = "fill") +
  theme_bw() + 
  labs(y = "percent")

1 Chi-Square

Note: \(E_{ij} = \frac{R_i \times C_j}{n_T}\)

\[\chi^2_{obs} = \sum\frac{(O- E)^2}{E}\]

T1 <- TITANIC3 %>% 
  select(pclass, survived) %>% 
  table() 
T1

      survived
pclass Dead Alive
   1st  123   200
   2nd  158   119
   3rd  528   181

addmargins(T1)

      survived
pclass Dead Alive  Sum
   1st  123   200  323
   2nd  158   119  277
   3rd  528   181  709
   Sum  809   500 1309

E11 <- (323/1309)*(809/1309)*1309
E12 <- (323/1309)*(500/1309)*1309
E21 <- (277/1309)*(809/1309)*1309
E22 <- (277/1309)*(500/1309)*1309
E31 <- (709/1309)*(809/1309)*1309
E32 <- (709/1309)*(500/1309)*1309
EIJ <- matrix(c(E11, E12, E21, E22, E31, E32), byrow = TRUE, nrow = 3)
EIJ

         [,1]     [,2]
[1,] 199.6234 123.3766
[2,] 171.1940 105.8060
[3,] 438.1826 270.8174

X2 <- (123 - E11)^2/E11 + (200 - E12)^2/E12 + (158 - E21)^2/E21 + (119 - E22)^2/E22 +
      (528 - E31)^2/E31 + (181 - E32)^2/E32 
X2

[1] 127.8592

pvalue <- pchisq(X2, 2, lower = FALSE)
pvalue

[1] 1.720826e-28

chisq.test(T1, correct = FALSE)


    Pearson's Chi-squared test

data:  T1
X-squared = 127.86, df = 2, p-value < 2.2e-16

# Easier fashion to get expected
chisq.test(T1)$exp

      survived
pclass     Dead    Alive
   1st 199.6234 123.3766
   2nd 171.1940 105.8060
   3rd 438.1826 270.8174

1.1 Same thing with permutation

1.1.1 Using a `for()` loop

B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
  PT <- xtabs(~pclass + sample(survived), data = TITANIC3)
  X2[i] <- chisq.test(PT, correct = FALSE)$stat
}
chi_obs_stat <- chisq.test(xtabs(~pclass + survived, data = TITANIC3))$stat
chi_obs_stat

X-squared 
 127.8592

# OR
chi_obs_stat <- chisq.test(table(TITANIC3$pclass, TITANIC3$survived))$stat
chi_obs_stat

X-squared 
 127.8592

#
pvalueSIM <- (sum(X2 >= chi_obs_stat) + 1)/(B + 1)
pvalueSIM

[1] 1e-04

What does the permutation distribution under the null hypothesis look like?

ggplot(data = data.frame(x = X2), aes(x = x)) +
  geom_density(fill = "pink") + 
  theme_bw()

1.2 Same thing with `infer`

null <- TITANIC3 %>% 
  specify(pclass ~ survived) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 10^3 - 1, type = "permute") %>% 
  calculate(stat = "Chisq")

1.2.1 Visualize the null distribution

ggplot(data = null, aes(x = stat)) +
  geom_density(fill = "skyblue") +
  theme_bw()

visualize(null, method = "both", dens_color = "blue") + 
  theme_bw()

null %>% 
  summarize(pvalue = (sum(stat >= chi_obs_stat) + 1)/(10^3))

# A tibble: 1 × 1
  pvalue
   <dbl>
1  0.001

Note problem below

get_p_value(null, obs_stat = chi_obs_stat, direction = "right")

# A tibble: 1 × 1
  p_value
    <dbl>
1       0

2 Consider testing \(H_0:p_1 - p_2 = 0\)

TITANIC3 %>% 
  select(sex, survived) %>% 
  table() %>% 
  addmargins()

        survived
sex      Dead Alive  Sum
  female  127   339  466
  male    682   161  843
  Sum     809   500 1309

TITANIC3 %>% 
  select(sex, survived) %>% 
  table() %>% 
  prop.table(1)

        survived
sex           Dead     Alive
  female 0.2725322 0.7274678
  male   0.8090154 0.1909846

Consider testing: \(H_0:p_{\text{Male Alive}} - p_{\text{Female Alive}} = 0\) versus \(H_A:p_{\text{Male Alive}} - p_{\text{Female Alive}} < 0\)

prop.test(x = c(161, 339), n = c(843, 466), alternative = "less", correct = FALSE)


    2-sample test for equality of proportions without continuity correction

data:  c(161, 339) out of c(843, 466)
X-squared = 365.89, df = 1, p-value < 2.2e-16
alternative hypothesis: less
95 percent confidence interval:
 -1.0000000 -0.4959006
sample estimates:
   prop 1    prop 2 
0.1909846 0.7274678

obs_stat <- prop.test(x = c(161, 339), n = c(843, 466), alternative = "less", correct = FALSE)$stat^.5
obs_stat

X-squared 
 19.12817

# Same as
phatmale <- 161/843
phatfemale <- 339/466
phatp <- (161 + 339)/(843 + 466)
zobs <- (phatmale - phatfemale) / sqrt( phatp*(1 - phatp)/843 + phatp*(1 - phatp)/466)
zobs

[1] -19.12817

pvalue <- pnorm(zobs)
pvalue

[1] 7.357266e-82

Note: This can also be tested using the \(\chi^2\)-test of homogeneity.

\(H_0: p_{\text{Male Alive}} = p_{\text{Female Alive}} = p_{\text{Male Dead}} = p_{\text{Female Dead}}\) versus at least some \(p_{ij} \neq p_{i+1, j}\) for some \((i,j)\).

T1 <- TITANIC3 %>% 
  select(sex, survived) %>% 
  table()
chisq.test(T1, correct = FALSE)


    Pearson's Chi-squared test

data:  T1
X-squared = 365.89, df = 1, p-value < 2.2e-16

chisq.test(T1, correct = FALSE)$stat^.5

X-squared 
 19.12817

null <- TITANIC3 %>% 
  specify(survived ~ sex, success = "Alive") %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 10^3 - 1, type = "permute") %>% 
  calculate(stat = "Chisq", order = c("male", "female"))
ggplot(data = null, aes(x  = stat)) + 
  geom_density(fill = "pink") +
  theme_bw()

null %>%
  summarize( (sum(stat >= obs_stat) + 1)/10^3)

# A tibble: 1 × 1
  `(sum(stat >= obs_stat) + 1)/10^3`
                               <dbl>
1                              0.001

B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
  T3 <- xtabs(~sex + sample(survived), data = TITANIC3)
  X2[i] <- chisq.test(T3, correct = FALSE)$stat 
}
pvalue <- (sum(X2 >= obs_stat) + 1)/10^4
pvalue

[1] 1e-04

3 Chi-Square test of Homogeneity

Data will often come summarized in contingency tables.

DP <- c(67, 76, 57, 48, 73, 79)
MDP <- matrix(data = DP, nrow = 2, byrow = TRUE)
dimnames(MDP) <- list(Pop = c("Drug", "Placebo"), Status = c("Improve", "No Change", "Worse"))
TDP <- as.table(MDP)
TDP

         Status
Pop       Improve No Change Worse
  Drug         67        76    57
  Placebo      48        73    79

The problem then becomes one of putting the data back into a tidy format.

NT <- TDP %>% 
  tibble::as_tibble() %>% 
  uncount(n)
NT

# A tibble: 400 × 2
   Pop   Status 
   <chr> <chr>  
 1 Drug  Improve
 2 Drug  Improve
 3 Drug  Improve
 4 Drug  Improve
 5 Drug  Improve
 6 Drug  Improve
 7 Drug  Improve
 8 Drug  Improve
 9 Drug  Improve
10 Drug  Improve
# ℹ 390 more rows

3.1 Graph

ggplot(data = NT, aes(x = Pop, fill = Status)) +
  geom_bar(position = "fill") +
  theme_bw()

T1 <- NT %>% 
  table()
T1

         Status
Pop       Improve No Change Worse
  Drug         67        76    57
  Placebo      48        73    79

chisq.test(T1)


    Pearson's Chi-squared test

data:  T1
X-squared = 6.7584, df = 2, p-value = 0.03408

(chi_obs <- chisq.test(T1)$stat)

X-squared 
 6.758357

E <- chisq.test(T1)$exp
E

         Status
Pop       Improve No Change Worse
  Drug       57.5      74.5    68
  Placebo    57.5      74.5    68

(X2 <- sum((T1 - E)^2/E))

[1] 6.758357

(pvalue <- pchisq(X2, 2, lower = FALSE))

[1] 0.03407544

3.2 Permutation distribution

B <- 10^3 - 1
X2 <- numeric(B)
for(i in 1:B){
  TN <- xtabs(~Pop + sample(Status), data = NT)
  X2[i] <- chisq.test(TN)$stat
}
null_hom <- data.frame(stat = X2)
ggplot(data = null_hom, aes(x = stat)) +
  geom_density(fill = "purple", alpha = 0.3) +
  theme_bw() + 
  stat_function(fun = dchisq, args = list(df = 2), color = "blue")

(pval <- (sum(null_hom$stat >= chi_obs) + 1)/(B + 1))

[1] 0.029

3.3 Same approach with `infer`

null_hom_infer <- NT %>% 
  specify(Pop ~ Status) %>% 
  hypothesize(null = "independence") %>% 
  generate(reps = 10^3 - 1, type = "permute") %>% 
  calculate(stat = "Chisq")
visualize(null_hom_infer, method = "both", dens_color = "blue") +
  theme_bw()

(pval2 <- (sum(null_hom_infer$stat >= chi_obs) + 1)/(B + 1))

[1] 0.035

3.4 GOF test

Given another table (from summarized information):

BD <- c(23, 20, 18, 23, 20, 19, 18, 21, 19, 22, 24, 29)
Sign = c("Aries", "Taurus", "Gemini", "Cancer", "Leo",
         "Virgo", "Libra", "Scorpio", "Sagitarius",
         "Capricorn", "Aquarius", "Pisces")
VEC <- rep(Sign, times = BD)
(T2 <- table(VEC))

VEC
  Aquarius      Aries     Cancer  Capricorn     Gemini        Leo      Libra 
        24         23         23         22         18         20         18 
    Pisces Sagitarius    Scorpio     Taurus      Virgo 
        29         19         21         20         19

BNT <- T2 %>% 
  tibble::as_tibble() %>% 
  uncount(n)
BNT

# A tibble: 256 × 1
   VEC     
   <chr>   
 1 Aquarius
 2 Aquarius
 3 Aquarius
 4 Aquarius
 5 Aquarius
 6 Aquarius
 7 Aquarius
 8 Aquarius
 9 Aquarius
10 Aquarius
# ℹ 246 more rows

3.5 Graph

ggplot(data = BNT, aes(x = VEC)) +
  geom_bar(fill = rainbow(12)) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  labs(x = "Astrological Sign")

chisq.test(T2, p = rep(1/12, 12))


    Chi-squared test for given probabilities

data:  T2
X-squared = 5.0938, df = 11, p-value = 0.9265

(pv <- chisq.test(T2, p = rep(1/12, 12))$p.value)

[1] 0.9265414

3.5.1 Conclusion

There is little evidence (p-value = 0.9265414) to suggest a nonuniform distribution of zodiac signs among executives.

3.6 Chi-Square Test of Independence

TH <- c(17, 35, 8, 53, 22, 491)
MTH <- matrix(data = TH, nrow = 3, byrow = TRUE)
dimnames(MTH) <- list(Tatoo = c("Parlor", "Elsewhere", "None"), HepC = c("Yes", "No"))
MTH <- as.table(MTH)
MTH

           HepC
Tatoo       Yes  No
  Parlor     17  35
  Elsewhere   8  53
  None       22 491

NHT <- MTH %>% 
  tibble::as_tibble() %>% 
  uncount(n)
NHT

# A tibble: 626 × 2
   Tatoo  HepC 
   <chr>  <chr>
 1 Parlor Yes  
 2 Parlor Yes  
 3 Parlor Yes  
 4 Parlor Yes  
 5 Parlor Yes  
 6 Parlor Yes  
 7 Parlor Yes  
 8 Parlor Yes  
 9 Parlor Yes  
10 Parlor Yes  
# ℹ 616 more rows

ggplot(data = NHT, aes( x = HepC, fill = Tatoo)) +
  geom_bar(position = "fill") +
  theme_bw() +
  scale_fill_manual(values = c("pink", "green", "red"))

(T1 <- xtabs(~HepC + Tatoo,data = NHT))

     Tatoo
HepC  Elsewhere None Parlor
  No         53  491     35
  Yes         8   22     17

chisq.test(T1)


    Pearson's Chi-squared test

data:  T1
X-squared = 57.912, df = 2, p-value = 2.658e-13

(obs_stat <- chisq.test(T1)$stat)

X-squared 
 57.91217

B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
  TT <- xtabs(~HepC + sample(Tatoo), data = NHT)
  X2[i] <- chisq.test(TT)$stat
}
(pval <- (sum(X2 >= obs_stat) + 1)/(10^4 -1 + 1))

[1] 1e-04

Chi-Square

Notes

Last compiled: Tuesday, November 26, 2024 - 09:16:00 AM.

1 Chi-Square

1.1 Same thing with permutation

1.1.1 Using a `for()` loop

1.2 Same thing with `infer`

1.2.1 Visualize the null distribution

2 Consider testing \(H_0:p_1 - p_2 = 0\)

3 Chi-Square test of Homogeneity

3.1 Graph

3.2 Permutation distribution

3.3 Same approach with `infer`

3.4 GOF test

3.5 Graph

3.5.1 Conclusion

3.6 Chi-Square Test of Independence

Chi-Square

Notes

Last compiled: Tuesday, November 26, 2024 - 09:16:00 AM.

1 Chi-Square

1.1 Same thing with permutation

1.1.1 Using a for() loop

1.2 Same thing with infer

1.2.1 Visualize the null distribution

2 Consider testing \(H_0:p_1 - p_2 = 0\)

3 Chi-Square test of Homogeneity

3.1 Graph

3.2 Permutation distribution

3.3 Same approach with infer

3.4 GOF test

3.5 Graph

3.5.1 Conclusion

3.6 Chi-Square Test of Independence

1.1.1 Using a `for()` loop

1.2 Same thing with `infer`

3.3 Same approach with `infer`