Loading the Data
library(readxl)
library(dplyr)
library(ggplot2)
DF <- read_excel("../Data/TMP.xlsx")
head(DF)
# A tibble: 6 × 6
Id Age_Cohort Age Gender Expenditures Ethnicity
<dbl> <chr> <dbl> <chr> <dbl> <chr>
1 10210 13-17 17 Female 2113 White not Hispanic
2 10409 22-50 37 Male 41924 White not Hispanic
3 10486 0 - 5 3 Male 1454 Hispanic
4 10538 18-21 19 Female 6400 Hispanic
5 10568 13-17 13 Male 4412 White not Hispanic
6 10690 13-17 15 Female 4566 Hispanic
DF$Age_Cohort <- gsub(42898, "6-12", DF$Age_Cohort)
DF$Age_Cohort <- gsub("0 - 5", "0-5", DF$Age_Cohort)
DF$Age_Cohort <- factor(DF$Age_Cohort, levels = c("0-5","6-12","13-17","18-21","22-50","51 +"))
table(DF$Age_Cohort)
0-5 6-12 13-17 18-21 22-50 51 +
82 175 212 199 226 106
DT::datatable(DF)
write.csv(DF, file = "../Data/discrim.csv")
Together
DF %>%
group_by(Gender) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n(), SD = sd(Expenditures))
# A tibble: 2 × 5
Gender ME MDE n SD
<chr> <dbl> <dbl> <int> <dbl>
1 Female 18130. 6400 503 20020.
2 Male 18001. 7219 497 19068.
ggplot(data = DF, aes(x = Gender, y = Expenditures, fill = Gender)) +
geom_boxplot() +
theme_bw() +
scale_fill_manual(values = c("pink", "blue"))
DF %>%
group_by(Gender) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n()) %>%
ggplot(aes(x = Gender, y= ME, fill = Gender)) +
geom_bar(stat = "identity") +
labs(title = "Average Expenditure by Gender", y = "Mean Expenditure") +
theme_bw() +
scale_fill_manual(values = c("pink", "blue"))
DF %>%
group_by(Ethnicity) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 8 × 4
Ethnicity ME MDE n
<chr> <dbl> <dbl> <int>
1 American Indian 36438. 41818. 4
2 Asian 18392. 9369 129
3 Black 20885. 8687 59
4 Hispanic 11066. 3952 376
5 Multi Race 4457. 2622 26
6 Native Hawaiian 42782. 40727 3
7 Other 3316. 3316. 2
8 White not Hispanic 24698. 15718 401
DF %>%
group_by(Age_Cohort) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 6 × 4
Age_Cohort ME MDE n
<fct> <dbl> <dbl> <int>
1 0-5 1415. 1380. 82
2 6-12 2227. 2191 175
3 13-17 3923. 3952 212
4 18-21 9889. 9979 199
5 22-50 40209. 40456. 226
6 51 + 53522. 53509 106
DF %>%
group_by(Ethnicity) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n()) %>%
ggplot(aes(x = reorder(Ethnicity, ME), y = ME)) +
geom_bar(stat="identity", fill = "red") +
theme_bw() +
theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
labs(x = "", y = "Mean Expenditure", title = "Average Expenditure by Ethnicity")
ggplot(data = DF, aes(x = reorder(Ethnicity, Expenditures, median), y = Expenditures)) +
geom_boxplot() +
theme_bw() +
theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
labs(x = "")
Typical expenditure for 22-50 year old
DF %>%
group_by(Age_Cohort) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 6 × 4
Age_Cohort ME MDE n
<fct> <dbl> <dbl> <int>
1 0-5 1415. 1380. 82
2 6-12 2227. 2191 175
3 13-17 3923. 3952 212
4 18-21 9889. 9979 199
5 22-50 40209. 40456. 226
6 51 + 53522. 53509 106
ggplot(data = DF, aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures)) + geom_boxplot() + theme_bw()
ggplot(data = DF, aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures)) + geom_boxplot() + theme_bw() +
facet_grid(.~Gender)
ggplot(data = DF, aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures)) + geom_boxplot(varwidth = TRUE) + theme_bw() +
facet_grid(Ethnicity ~ Gender)
DF %>%
group_by(Gender, Ethnicity) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 16 × 5
# Groups: Gender [2]
Gender Ethnicity ME MDE n
<chr> <chr> <dbl> <dbl> <int>
1 Female American Indian 39183. 55430 3
2 Female Asian 20008. 9200 61
3 Female Black 18775. 5254. 26
4 Female Hispanic 10786. 3923 192
5 Female Multi Race 3227. 2335 13
6 Female Native Hawaiian 45434 45434 2
7 Female Other 2018 2018 1
8 Female White not Hispanic 24816. 12882 205
9 Male American Indian 28205 28205 1
10 Male Asian 16943. 9608. 68
11 Male Black 22547. 11327 33
12 Male Hispanic 11357. 4002. 184
13 Male Multi Race 5687. 3000 13
14 Male Native Hawaiian 37479 37479 1
15 Male Other 4615 4615 1
16 Male White not Hispanic 24574. 27390. 196
DF %>%
group_by(Ethnicity, Age_Cohort) %>%
summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 35 × 5
# Groups: Ethnicity [8]
Ethnicity Age_Cohort ME MDE n
<chr> <fct> <dbl> <dbl> <int>
1 American Indian 13-17 3726 3726 1
2 American Indian 22-50 28205 28205 1
3 American Indian 51 + 56911 56911 2
4 Asian 0-5 1502. 1388 8
5 Asian 6-12 2165. 2001 18
6 Asian 13-17 3509. 3628. 20
7 Asian 18-21 9598. 9846 41
8 Asian 22-50 39581. 40240 29
9 Asian 51 + 54623. 54481 13
10 Black 0-5 1083 1398 3
# … with 25 more rows
Just consider White and Hispanic
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
group_by(Ethnicity) %>%
summarize(ME = mean(Expenditures), n = n())
# A tibble: 2 × 3
Ethnicity ME n
<chr> <dbl> <int>
1 Hispanic 11066. 376
2 White not Hispanic 24698. 401
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
group_by(Ethnicity) %>%
summarize(ME = mean(Expenditures), n = n()) %>%
ggplot(aes(x = Ethnicity, y = ME, fill = Ethnicity)) +
geom_bar(stat = "identity") +
theme_bw() +
scale_fill_manual(values = c("chocolate", "peachpuff")) +
labs(y = "Mean Expenditure")
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
ggplot(aes(x = Ethnicity, y = Expenditures, fill = Ethnicity)) +
geom_boxplot() +
theme_bw() +
scale_fill_manual(values = c("chocolate", "peachpuff"))
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
group_by(Age_Cohort, Ethnicity) %>%
summarize(ME = mean(Expenditures), n = n())
# A tibble: 12 × 4
# Groups: Age_Cohort [6]
Age_Cohort Ethnicity ME n
<fct> <chr> <dbl> <int>
1 0-5 Hispanic 1393. 44
2 0-5 White not Hispanic 1367. 20
3 6-12 Hispanic 2312. 91
4 6-12 White not Hispanic 2052. 46
5 13-17 Hispanic 3955. 103
6 13-17 White not Hispanic 3904. 67
7 18-21 Hispanic 9960. 78
8 18-21 White not Hispanic 10133. 69
9 22-50 Hispanic 40924. 43
10 22-50 White not Hispanic 40188. 133
11 51 + Hispanic 55585 17
12 51 + White not Hispanic 52670. 66
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
group_by(Age_Cohort, Ethnicity) %>%
summarize(ME = mean(Expenditures), n = n()) %>%
ggplot(aes(x = reorder(Age_Cohort, ME), y = ME, fill = Ethnicity)) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
labs(x = "Age Cohort", y = "Average Expenditure",
title = "Average Expenditures by Age Cohort and Ethnicity") +
scale_fill_manual(values = c("chocolate", "peachpuff"))
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
ggplot(aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures, fill = Ethnicity)) +
geom_boxplot(position = position_dodge(0.9)) +
theme_bw() +
scale_fill_manual(values = c("chocolate", "peachpuff"))
Some Explanation
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
group_by(Age_Cohort, Ethnicity) %>%
summarize(ME = mean(Expenditures), n = n())
# A tibble: 12 × 4
# Groups: Age_Cohort [6]
Age_Cohort Ethnicity ME n
<fct> <chr> <dbl> <int>
1 0-5 Hispanic 1393. 44
2 0-5 White not Hispanic 1367. 20
3 6-12 Hispanic 2312. 91
4 6-12 White not Hispanic 2052. 46
5 13-17 Hispanic 3955. 103
6 13-17 White not Hispanic 3904. 67
7 18-21 Hispanic 9960. 78
8 18-21 White not Hispanic 10133. 69
9 22-50 Hispanic 40924. 43
10 22-50 White not Hispanic 40188. 133
11 51 + Hispanic 55585 17
12 51 + White not Hispanic 52670. 66
DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
group_by(Age_Cohort, Ethnicity) %>%
summarize(ME = mean(Expenditures), n = n()) %>%
ggplot(aes(x = reorder(Age_Cohort, ME), y = n, fill = Ethnicity)) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
scale_fill_manual(values = c("chocolate", "peachpuff")) +
labs(x = "Age Cohort", y = "Number in Group", title = "Consumers by Ethnicity and Age Cohort")
Age Expenditures Relationship?
ggplot(data = subset(DF,Ethnicity == "White not Hispanic" | Ethnicity == "Hispanic" ), aes(x = Age, y = Expenditures, color = Ethnicity)) +
geom_point(alpha = 0.8) +
geom_smooth(se = FALSE) +
theme_bw() +
scale_color_manual(values = c("chocolate", "peachpuff"))
Tests?
NDF <- DF %>%
filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
select(Ethnicity, Expenditures)
NDF %>%
group_by(Ethnicity) %>%
summarize(AVG = mean(Expenditures))
# A tibble: 2 × 2
Ethnicity AVG
<chr> <dbl>
1 Hispanic 11066.
2 White not Hispanic 24698.
DT::datatable(NDF)
White <- NDF$Expenditures[NDF$Ethnicity=="White not Hispanic"]
mean(White)
[1] 24697.55
Hispanic <- NDF$Expenditures[NDF$Ethnicity=="Hispanic"]
mean(Hispanic)
[1] 11065.57
nW <- length(White)
nH <- length(Hispanic)
BV <- c(White, Hispanic)
obs_diff <- mean(White) - mean(Hispanic)
obs_diff
[1] 13631.98
sims <- 10^4 - 1
ans <- numeric(sims)
for(i in 1:sims){
index <- sample(nW + nH, nW)
ans[i] <- mean(BV[index]) - mean(BV[-index])
}
hist(ans, xlim = c(-obs_diff-10, obs_diff+10))
abline(v = obs_diff)
#
ggplot(data = data.frame(x = ans), aes(x = x)) +
geom_histogram(binwidth = 500, fill = "pink", color = "black") +
theme_bw() +
geom_vline(xintercept = obs_diff)
pvalue <- (sum(ans >= obs_diff) + 1)/(sims + 1)
pvalue
[1] 1e-04
model <- lm(Expenditures ~ Age_Cohort * Ethnicity,
data = subset(DF, Ethnicity == "White not Hispanic" | Ethnicity == "Hispanic"))
summary(model)
Call:
lm(formula = Expenditures ~ Age_Cohort * Ethnicity, data = subset(DF,
Ethnicity == "White not Hispanic" | Ethnicity == "Hispanic"))
Residuals:
Min 1Q Median 3Q Max
-19560.4 -1177.6 -0.4 1163.6 16528.4
Coefficients:
Estimate Std. Error t value
(Intercept) 1393.20 581.15 2.397
Age_Cohort6-12 918.98 707.84 1.298
Age_Cohort13-17 2562.08 694.27 3.690
Age_Cohort18-21 8566.64 726.81 11.787
Age_Cohort22-50 39530.91 826.64 47.821
Age_Cohort51 + 54191.80 1100.86 49.227
EthnicityWhite not Hispanic -26.30 1039.60 -0.025
Age_Cohort6-12:EthnicityWhite not Hispanic -233.62 1251.84 -0.187
Age_Cohort13-17:EthnicityWhite not Hispanic -24.62 1202.84 -0.020
Age_Cohort18-21:EthnicityWhite not Hispanic 199.52 1219.28 0.164
Age_Cohort22-50:EthnicityWhite not Hispanic -710.19 1240.20 -0.573
Age_Cohort51 +:EthnicityWhite not Hispanic -2888.27 1476.50 -1.956
Pr(>|t|)
(Intercept) 0.01675 *
Age_Cohort6-12 0.19458
Age_Cohort13-17 0.00024 ***
Age_Cohort18-21 < 2e-16 ***
Age_Cohort22-50 < 2e-16 ***
Age_Cohort51 + < 2e-16 ***
EthnicityWhite not Hispanic 0.97982
Age_Cohort6-12:EthnicityWhite not Hispanic 0.85201
Age_Cohort13-17:EthnicityWhite not Hispanic 0.98368
Age_Cohort18-21:EthnicityWhite not Hispanic 0.87006
Age_Cohort22-50:EthnicityWhite not Hispanic 0.56706
Age_Cohort51 +:EthnicityWhite not Hispanic 0.05081 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3855 on 765 degrees of freedom
Multiple R-squared: 0.9618, Adjusted R-squared: 0.9612
F-statistic: 1750 on 11 and 765 DF, p-value: < 2.2e-16