Is there a relationship between pclass
and survived
?
TITANIC3 <- TITANIC3 %>%
mutate(survived = factor(survived, labels = c("Dead", "Alive")))
ggplot(data = TITANIC3, aes(x = survived, fill = pclass)) +
geom_bar(position = "fill") +
theme_bw() +
labs(y = "percent")
Note: \(E_{ij} = \frac{R_i \times C_j}{n_T}\)
\[\chi^2_{obs} = \sum\frac{(O- E)^2}{E}\]
T1 <- TITANIC3 %>%
select(pclass, survived) %>%
table()
T1
survived
pclass Dead Alive
1st 123 200
2nd 158 119
3rd 528 181
addmargins(T1)
survived
pclass Dead Alive Sum
1st 123 200 323
2nd 158 119 277
3rd 528 181 709
Sum 809 500 1309
E11 <- (323/1309)*(809/1309)*1309
E12 <- (323/1309)*(500/1309)*1309
E21 <- (277/1309)*(809/1309)*1309
E22 <- (277/1309)*(500/1309)*1309
E31 <- (709/1309)*(809/1309)*1309
E32 <- (709/1309)*(500/1309)*1309
EIJ <- matrix(c(E11, E12, E21, E22, E31, E32), byrow = TRUE, nrow = 3)
EIJ
[,1] [,2]
[1,] 199.6234 123.3766
[2,] 171.1940 105.8060
[3,] 438.1826 270.8174
X2 <- (123 - E11)^2/E11 + (200 - E12)^2/E12 + (158 - E21)^2/E21 + (119 - E22)^2/E22 +
(528 - E31)^2/E31 + (181 - E32)^2/E32
X2
[1] 127.8592
pvalue <- pchisq(X2, 2, lower = FALSE)
pvalue
[1] 1.720826e-28
chisq.test(T1, correct = FALSE)
Pearson's Chi-squared test
data: T1
X-squared = 127.86, df = 2, p-value < 2.2e-16
# Easier fashion to get expected
chisq.test(T1)$exp
survived
pclass Dead Alive
1st 199.6234 123.3766
2nd 171.1940 105.8060
3rd 438.1826 270.8174
for()
loopB <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
PT <- xtabs(~pclass + sample(survived), data = TITANIC3)
X2[i] <- chisq.test(PT, correct = FALSE)$stat
}
chi_obs_stat <- chisq.test(xtabs(~pclass + survived, data = TITANIC3))$stat
chi_obs_stat
X-squared
127.8592
# OR
chi_obs_stat <- chisq.test(table(TITANIC3$pclass, TITANIC3$survived))$stat
chi_obs_stat
X-squared
127.8592
#
pvalueSIM <- (sum(X2 >= chi_obs_stat) + 1)/(B + 1)
pvalueSIM
[1] 1e-04
What does the permutation distribution under the null hypothesis look like?
ggplot(data = data.frame(x = X2), aes(x = x)) +
geom_density(fill = "pink") +
theme_bw()
infer
null <- TITANIC3 %>%
specify(pclass ~ survived) %>%
hypothesize(null = "independence") %>%
generate(reps = 10^3 - 1, type = "permute") %>%
calculate(stat = "Chisq")
ggplot(data = null, aes(x = stat)) +
geom_density(fill = "skyblue") +
theme_bw()
visualize(null, method = "both", dens_color = "blue") +
theme_bw()
null %>%
summarize(pvalue = (sum(stat >= chi_obs_stat) + 1)/(10^3))
# A tibble: 1 × 1
pvalue
<dbl>
1 0.001
Note problem below
get_p_value(null, obs_stat = chi_obs_stat, direction = "right")
# A tibble: 1 × 1
p_value
<dbl>
1 0
TITANIC3 %>%
select(sex, survived) %>%
table() %>%
addmargins()
survived
sex Dead Alive Sum
female 127 339 466
male 682 161 843
Sum 809 500 1309
TITANIC3 %>%
select(sex, survived) %>%
table() %>%
prop.table(1)
survived
sex Dead Alive
female 0.2725322 0.7274678
male 0.8090154 0.1909846
Consider testing: \(H_0:p_{\text{Male Alive}} - p_{\text{Female Alive}} = 0\) versus \(H_A:p_{\text{Male Alive}} - p_{\text{Female Alive}} < 0\)
prop.test(x = c(161, 339), n = c(843, 466), alternative = "less", correct = FALSE)
2-sample test for equality of proportions without continuity correction
data: c(161, 339) out of c(843, 466)
X-squared = 365.89, df = 1, p-value < 2.2e-16
alternative hypothesis: less
95 percent confidence interval:
-1.0000000 -0.4959006
sample estimates:
prop 1 prop 2
0.1909846 0.7274678
obs_stat <- prop.test(x = c(161, 339), n = c(843, 466), alternative = "less", correct = FALSE)$stat^.5
obs_stat
X-squared
19.12817
# Same as
phatmale <- 161/843
phatfemale <- 339/466
phatp <- (161 + 339)/(843 + 466)
zobs <- (phatmale - phatfemale) / sqrt( phatp*(1 - phatp)/843 + phatp*(1 - phatp)/466)
zobs
[1] -19.12817
pvalue <- pnorm(zobs)
pvalue
[1] 7.357266e-82
Note: This can also be tested using the \(\chi^2\)-test of homogeneity.
\(H_0: p_{\text{Male Alive}} = p_{\text{Female Alive}} = p_{\text{Male Dead}} = p_{\text{Female Dead}}\) versus at least some \(p_{ij} \neq p_{i+1, j}\) for some \((i,j)\).
T1 <- TITANIC3 %>%
select(sex, survived) %>%
table()
chisq.test(T1, correct = FALSE)
Pearson's Chi-squared test
data: T1
X-squared = 365.89, df = 1, p-value < 2.2e-16
chisq.test(T1, correct = FALSE)$stat^.5
X-squared
19.12817
null <- TITANIC3 %>%
specify(survived ~ sex, success = "Alive") %>%
hypothesize(null = "independence") %>%
generate(reps = 10^3 - 1, type = "permute") %>%
calculate(stat = "Chisq", order = c("male", "female"))
ggplot(data = null, aes(x = stat)) +
geom_density(fill = "pink") +
theme_bw()
null %>%
summarize( (sum(stat >= obs_stat) + 1)/10^3)
# A tibble: 1 × 1
`(sum(stat >= obs_stat) + 1)/10^3`
<dbl>
1 0.001
B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
T3 <- xtabs(~sex + sample(survived), data = TITANIC3)
X2[i] <- chisq.test(T3, correct = FALSE)$stat
}
pvalue <- (sum(X2 >= obs_stat) + 1)/10^4
pvalue
[1] 1e-04
Data will often come summarized in contingency tables.
DP <- c(67, 76, 57, 48, 73, 79)
MDP <- matrix(data = DP, nrow = 2, byrow = TRUE)
dimnames(MDP) <- list(Pop = c("Drug", "Placebo"), Status = c("Improve", "No Change", "Worse"))
TDP <- as.table(MDP)
TDP
Status
Pop Improve No Change Worse
Drug 67 76 57
Placebo 48 73 79
The problem then becomes one of putting the data back into a tidy
format.
NT <- TDP %>%
tibble::as_tibble() %>%
uncount(n)
NT
# A tibble: 400 × 2
Pop Status
<chr> <chr>
1 Drug Improve
2 Drug Improve
3 Drug Improve
4 Drug Improve
5 Drug Improve
6 Drug Improve
7 Drug Improve
8 Drug Improve
9 Drug Improve
10 Drug Improve
# ℹ 390 more rows
ggplot(data = NT, aes(x = Pop, fill = Status)) +
geom_bar(position = "fill") +
theme_bw()
T1 <- NT %>%
table()
T1
Status
Pop Improve No Change Worse
Drug 67 76 57
Placebo 48 73 79
chisq.test(T1)
Pearson's Chi-squared test
data: T1
X-squared = 6.7584, df = 2, p-value = 0.03408
(chi_obs <- chisq.test(T1)$stat)
X-squared
6.758357
E <- chisq.test(T1)$exp
E
Status
Pop Improve No Change Worse
Drug 57.5 74.5 68
Placebo 57.5 74.5 68
(X2 <- sum((T1 - E)^2/E))
[1] 6.758357
(pvalue <- pchisq(X2, 2, lower = FALSE))
[1] 0.03407544
B <- 10^3 - 1
X2 <- numeric(B)
for(i in 1:B){
TN <- xtabs(~Pop + sample(Status), data = NT)
X2[i] <- chisq.test(TN)$stat
}
null_hom <- data.frame(stat = X2)
ggplot(data = null_hom, aes(x = stat)) +
geom_density(fill = "purple", alpha = 0.3) +
theme_bw() +
stat_function(fun = dchisq, args = list(df = 2), color = "blue")
(pval <- (sum(null_hom$stat >= chi_obs) + 1)/(B + 1))
[1] 0.029
infer
null_hom_infer <- NT %>%
specify(Pop ~ Status) %>%
hypothesize(null = "independence") %>%
generate(reps = 10^3 - 1, type = "permute") %>%
calculate(stat = "Chisq")
visualize(null_hom_infer, method = "both", dens_color = "blue") +
theme_bw()
(pval2 <- (sum(null_hom_infer$stat >= chi_obs) + 1)/(B + 1))
[1] 0.035
Given another table (from summarized information):
BD <- c(23, 20, 18, 23, 20, 19, 18, 21, 19, 22, 24, 29)
Sign = c("Aries", "Taurus", "Gemini", "Cancer", "Leo",
"Virgo", "Libra", "Scorpio", "Sagitarius",
"Capricorn", "Aquarius", "Pisces")
VEC <- rep(Sign, times = BD)
(T2 <- table(VEC))
VEC
Aquarius Aries Cancer Capricorn Gemini Leo Libra
24 23 23 22 18 20 18
Pisces Sagitarius Scorpio Taurus Virgo
29 19 21 20 19
BNT <- T2 %>%
tibble::as_tibble() %>%
uncount(n)
BNT
# A tibble: 256 × 1
VEC
<chr>
1 Aquarius
2 Aquarius
3 Aquarius
4 Aquarius
5 Aquarius
6 Aquarius
7 Aquarius
8 Aquarius
9 Aquarius
10 Aquarius
# ℹ 246 more rows
ggplot(data = BNT, aes(x = VEC)) +
geom_bar(fill = rainbow(12)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(x = "Astrological Sign")
chisq.test(T2, p = rep(1/12, 12))
Chi-squared test for given probabilities
data: T2
X-squared = 5.0938, df = 11, p-value = 0.9265
(pv <- chisq.test(T2, p = rep(1/12, 12))$p.value)
[1] 0.9265414
There is little evidence (p-value = 0.9265414) to suggest a nonuniform distribution of zodiac signs among executives.
TH <- c(17, 35, 8, 53, 22, 491)
MTH <- matrix(data = TH, nrow = 3, byrow = TRUE)
dimnames(MTH) <- list(Tatoo = c("Parlor", "Elsewhere", "None"), HepC = c("Yes", "No"))
MTH <- as.table(MTH)
MTH
HepC
Tatoo Yes No
Parlor 17 35
Elsewhere 8 53
None 22 491
NHT <- MTH %>%
tibble::as_tibble() %>%
uncount(n)
NHT
# A tibble: 626 × 2
Tatoo HepC
<chr> <chr>
1 Parlor Yes
2 Parlor Yes
3 Parlor Yes
4 Parlor Yes
5 Parlor Yes
6 Parlor Yes
7 Parlor Yes
8 Parlor Yes
9 Parlor Yes
10 Parlor Yes
# ℹ 616 more rows
ggplot(data = NHT, aes( x = HepC, fill = Tatoo)) +
geom_bar(position = "fill") +
theme_bw() +
scale_fill_manual(values = c("pink", "green", "red"))
(T1 <- xtabs(~HepC + Tatoo,data = NHT))
Tatoo
HepC Elsewhere None Parlor
No 53 491 35
Yes 8 22 17
chisq.test(T1)
Pearson's Chi-squared test
data: T1
X-squared = 57.912, df = 2, p-value = 2.658e-13
(obs_stat <- chisq.test(T1)$stat)
X-squared
57.91217
B <- 10^4 - 1
X2 <- numeric(B)
for(i in 1:B){
TT <- xtabs(~HepC + sample(Tatoo), data = NHT)
X2[i] <- chisq.test(TT)$stat
}
(pval <- (sum(X2 >= obs_stat) + 1)/(10^4 -1 + 1))
[1] 1e-04