Loading the Data

library(readxl)
library(dplyr)
library(ggplot2)
DF <- read_excel("../Data/TMP.xlsx")
head(DF)
# A tibble: 6 × 6
     Id Age_Cohort   Age Gender Expenditures Ethnicity         
  <dbl> <chr>      <dbl> <chr>         <dbl> <chr>             
1 10210 13-17         17 Female         2113 White not Hispanic
2 10409 22-50         37 Male          41924 White not Hispanic
3 10486 0 - 5          3 Male           1454 Hispanic          
4 10538 18-21         19 Female         6400 Hispanic          
5 10568 13-17         13 Male           4412 White not Hispanic
6 10690 13-17         15 Female         4566 Hispanic          
DF$Age_Cohort <- gsub(42898, "6-12", DF$Age_Cohort)
DF$Age_Cohort <- gsub("0 - 5", "0-5", DF$Age_Cohort)
DF$Age_Cohort <- factor(DF$Age_Cohort, levels = c("0-5","6-12","13-17","18-21","22-50","51 +"))
table(DF$Age_Cohort)

  0-5  6-12 13-17 18-21 22-50  51 + 
   82   175   212   199   226   106 
DT::datatable(DF)
write.csv(DF, file = "../Data/discrim.csv")

Together

DF %>% 
  group_by(Gender) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n(), SD = sd(Expenditures))
# A tibble: 2 × 5
  Gender     ME   MDE     n     SD
  <chr>   <dbl> <dbl> <int>  <dbl>
1 Female 18130.  6400   503 20020.
2 Male   18001.  7219   497 19068.
ggplot(data = DF, aes(x = Gender, y = Expenditures, fill = Gender)) + 
  geom_boxplot() + 
  theme_bw() + 
  scale_fill_manual(values = c("pink", "blue"))

DF %>% 
  group_by(Gender) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n()) %>%
  ggplot(aes(x = Gender, y= ME, fill = Gender)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Average Expenditure by Gender", y = "Mean Expenditure") + 
  theme_bw() + 
  scale_fill_manual(values = c("pink", "blue"))

DF %>% 
  group_by(Ethnicity) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 8 × 4
  Ethnicity              ME    MDE     n
  <chr>               <dbl>  <dbl> <int>
1 American Indian    36438. 41818.     4
2 Asian              18392.  9369    129
3 Black              20885.  8687     59
4 Hispanic           11066.  3952    376
5 Multi Race          4457.  2622     26
6 Native Hawaiian    42782. 40727      3
7 Other               3316.  3316.     2
8 White not Hispanic 24698. 15718    401
DF %>% 
  group_by(Age_Cohort) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 6 × 4
  Age_Cohort     ME    MDE     n
  <fct>       <dbl>  <dbl> <int>
1 0-5         1415.  1380.    82
2 6-12        2227.  2191    175
3 13-17       3923.  3952    212
4 18-21       9889.  9979    199
5 22-50      40209. 40456.   226
6 51 +       53522. 53509    106
DF %>% 
  group_by(Ethnicity) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n()) %>%  
  ggplot(aes(x = reorder(Ethnicity, ME), y = ME)) +
  geom_bar(stat="identity", fill = "red") + 
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 50, hjust = 1)) + 
  labs(x = "", y = "Mean Expenditure", title = "Average Expenditure by Ethnicity")

ggplot(data = DF, aes(x = reorder(Ethnicity, Expenditures, median), y = Expenditures)) + 
  geom_boxplot() +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 50, hjust = 1)) +
  labs(x = "")

Typical expenditure for 22-50 year old

DF %>% 
  group_by(Age_Cohort) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 6 × 4
  Age_Cohort     ME    MDE     n
  <fct>       <dbl>  <dbl> <int>
1 0-5         1415.  1380.    82
2 6-12        2227.  2191    175
3 13-17       3923.  3952    212
4 18-21       9889.  9979    199
5 22-50      40209. 40456.   226
6 51 +       53522. 53509    106
ggplot(data = DF, aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures)) + geom_boxplot() + theme_bw()

ggplot(data = DF, aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures)) + geom_boxplot() + theme_bw() + 
  facet_grid(.~Gender)

ggplot(data = DF, aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures)) + geom_boxplot(varwidth = TRUE) + theme_bw() + 
  facet_grid(Ethnicity ~ Gender)

DF %>% 
  group_by(Gender, Ethnicity) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 16 × 5
# Groups:   Gender [2]
   Gender Ethnicity              ME    MDE     n
   <chr>  <chr>               <dbl>  <dbl> <int>
 1 Female American Indian    39183. 55430      3
 2 Female Asian              20008.  9200     61
 3 Female Black              18775.  5254.    26
 4 Female Hispanic           10786.  3923    192
 5 Female Multi Race          3227.  2335     13
 6 Female Native Hawaiian    45434  45434      2
 7 Female Other               2018   2018      1
 8 Female White not Hispanic 24816. 12882    205
 9 Male   American Indian    28205  28205      1
10 Male   Asian              16943.  9608.    68
11 Male   Black              22547. 11327     33
12 Male   Hispanic           11357.  4002.   184
13 Male   Multi Race          5687.  3000     13
14 Male   Native Hawaiian    37479  37479      1
15 Male   Other               4615   4615      1
16 Male   White not Hispanic 24574. 27390.   196
DF %>% 
  group_by(Ethnicity, Age_Cohort) %>% 
  summarize(ME = mean(Expenditures), MDE = median(Expenditures), n= n())
# A tibble: 35 × 5
# Groups:   Ethnicity [8]
   Ethnicity       Age_Cohort     ME    MDE     n
   <chr>           <fct>       <dbl>  <dbl> <int>
 1 American Indian 13-17       3726   3726      1
 2 American Indian 22-50      28205  28205      1
 3 American Indian 51 +       56911  56911      2
 4 Asian           0-5         1502.  1388      8
 5 Asian           6-12        2165.  2001     18
 6 Asian           13-17       3509.  3628.    20
 7 Asian           18-21       9598.  9846     41
 8 Asian           22-50      39581. 40240     29
 9 Asian           51 +       54623. 54481     13
10 Black           0-5         1083   1398      3
# … with 25 more rows

Just consider White and Hispanic

DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>% 
  group_by(Ethnicity) %>% 
  summarize(ME = mean(Expenditures), n = n())
# A tibble: 2 × 3
  Ethnicity              ME     n
  <chr>               <dbl> <int>
1 Hispanic           11066.   376
2 White not Hispanic 24698.   401
DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>% 
  group_by(Ethnicity) %>% 
  summarize(ME = mean(Expenditures), n = n()) %>% 
  ggplot(aes(x = Ethnicity, y = ME, fill = Ethnicity)) + 
  geom_bar(stat = "identity") +
  theme_bw() + 
  scale_fill_manual(values = c("chocolate", "peachpuff")) + 
  labs(y = "Mean Expenditure")

DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
  ggplot(aes(x = Ethnicity, y = Expenditures, fill = Ethnicity)) + 
  geom_boxplot() + 
  theme_bw() + 
  scale_fill_manual(values = c("chocolate", "peachpuff"))

DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>% 
  group_by(Age_Cohort, Ethnicity) %>% 
  summarize(ME = mean(Expenditures), n = n())
# A tibble: 12 × 4
# Groups:   Age_Cohort [6]
   Age_Cohort Ethnicity              ME     n
   <fct>      <chr>               <dbl> <int>
 1 0-5        Hispanic            1393.    44
 2 0-5        White not Hispanic  1367.    20
 3 6-12       Hispanic            2312.    91
 4 6-12       White not Hispanic  2052.    46
 5 13-17      Hispanic            3955.   103
 6 13-17      White not Hispanic  3904.    67
 7 18-21      Hispanic            9960.    78
 8 18-21      White not Hispanic 10133.    69
 9 22-50      Hispanic           40924.    43
10 22-50      White not Hispanic 40188.   133
11 51 +       Hispanic           55585     17
12 51 +       White not Hispanic 52670.    66
DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>% 
  group_by(Age_Cohort, Ethnicity) %>% 
  summarize(ME = mean(Expenditures), n = n()) %>%
  ggplot(aes(x = reorder(Age_Cohort, ME), y = ME, fill = Ethnicity)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw() + 
  labs(x = "Age Cohort", y = "Average Expenditure", 
       title = "Average Expenditures by Age Cohort and Ethnicity") + 
  scale_fill_manual(values = c("chocolate", "peachpuff"))

DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>%
  ggplot(aes(x = reorder(Age_Cohort, Expenditures, median), y = Expenditures, fill = Ethnicity)) + 
  geom_boxplot(position = position_dodge(0.9)) + 
  theme_bw() + 
  scale_fill_manual(values = c("chocolate", "peachpuff"))

Some Explanation

DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>% 
  group_by(Age_Cohort, Ethnicity) %>% 
  summarize(ME = mean(Expenditures), n = n())
# A tibble: 12 × 4
# Groups:   Age_Cohort [6]
   Age_Cohort Ethnicity              ME     n
   <fct>      <chr>               <dbl> <int>
 1 0-5        Hispanic            1393.    44
 2 0-5        White not Hispanic  1367.    20
 3 6-12       Hispanic            2312.    91
 4 6-12       White not Hispanic  2052.    46
 5 13-17      Hispanic            3955.   103
 6 13-17      White not Hispanic  3904.    67
 7 18-21      Hispanic            9960.    78
 8 18-21      White not Hispanic 10133.    69
 9 22-50      Hispanic           40924.    43
10 22-50      White not Hispanic 40188.   133
11 51 +       Hispanic           55585     17
12 51 +       White not Hispanic 52670.    66
DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>% 
  group_by(Age_Cohort, Ethnicity) %>% 
  summarize(ME = mean(Expenditures), n = n()) %>%
  ggplot(aes(x = reorder(Age_Cohort, ME), y = n, fill = Ethnicity)) + 
  geom_bar(stat = "identity", position = "dodge") + 
  theme_bw() + 
  scale_fill_manual(values = c("chocolate", "peachpuff")) + 
  labs(x = "Age Cohort", y = "Number in Group", title = "Consumers by Ethnicity and Age Cohort")

Age Expenditures Relationship?

ggplot(data = subset(DF,Ethnicity == "White not Hispanic" | Ethnicity == "Hispanic" ), aes(x = Age, y = Expenditures, color = Ethnicity)) + 
  geom_point(alpha = 0.8) +
  geom_smooth(se = FALSE) + 
  theme_bw() + 
  scale_color_manual(values = c("chocolate", "peachpuff"))

Tests?

NDF <- DF %>%
  filter(Ethnicity %in% c("Hispanic", "White not Hispanic")) %>% 
  select(Ethnicity, Expenditures)
NDF %>% 
  group_by(Ethnicity) %>% 
  summarize(AVG = mean(Expenditures))
# A tibble: 2 × 2
  Ethnicity             AVG
  <chr>               <dbl>
1 Hispanic           11066.
2 White not Hispanic 24698.
DT::datatable(NDF)
White <- NDF$Expenditures[NDF$Ethnicity=="White not Hispanic"]
mean(White)
[1] 24697.55
Hispanic <- NDF$Expenditures[NDF$Ethnicity=="Hispanic"]
mean(Hispanic)
[1] 11065.57
nW <- length(White)
nH <- length(Hispanic)
BV <- c(White, Hispanic)
obs_diff <- mean(White) - mean(Hispanic)
obs_diff
[1] 13631.98
sims <- 10^4 - 1
ans <- numeric(sims)
for(i in 1:sims){
  index <- sample(nW + nH, nW)
  ans[i] <- mean(BV[index]) - mean(BV[-index])
}
hist(ans, xlim = c(-obs_diff-10, obs_diff+10))
abline(v = obs_diff)

#
ggplot(data = data.frame(x = ans), aes(x = x)) + 
  geom_histogram(binwidth = 500, fill = "pink", color = "black") + 
  theme_bw() + 
  geom_vline(xintercept = obs_diff)

pvalue <- (sum(ans >= obs_diff) + 1)/(sims + 1)
pvalue
[1] 1e-04
model <- lm(Expenditures ~ Age_Cohort * Ethnicity, 
            data = subset(DF, Ethnicity == "White not Hispanic" | Ethnicity == "Hispanic"))
summary(model)

Call:
lm(formula = Expenditures ~ Age_Cohort * Ethnicity, data = subset(DF, 
    Ethnicity == "White not Hispanic" | Ethnicity == "Hispanic"))

Residuals:
     Min       1Q   Median       3Q      Max 
-19560.4  -1177.6     -0.4   1163.6  16528.4 

Coefficients:
                                            Estimate Std. Error t value
(Intercept)                                  1393.20     581.15   2.397
Age_Cohort6-12                                918.98     707.84   1.298
Age_Cohort13-17                              2562.08     694.27   3.690
Age_Cohort18-21                              8566.64     726.81  11.787
Age_Cohort22-50                             39530.91     826.64  47.821
Age_Cohort51 +                              54191.80    1100.86  49.227
EthnicityWhite not Hispanic                   -26.30    1039.60  -0.025
Age_Cohort6-12:EthnicityWhite not Hispanic   -233.62    1251.84  -0.187
Age_Cohort13-17:EthnicityWhite not Hispanic   -24.62    1202.84  -0.020
Age_Cohort18-21:EthnicityWhite not Hispanic   199.52    1219.28   0.164
Age_Cohort22-50:EthnicityWhite not Hispanic  -710.19    1240.20  -0.573
Age_Cohort51 +:EthnicityWhite not Hispanic  -2888.27    1476.50  -1.956
                                            Pr(>|t|)    
(Intercept)                                  0.01675 *  
Age_Cohort6-12                               0.19458    
Age_Cohort13-17                              0.00024 ***
Age_Cohort18-21                              < 2e-16 ***
Age_Cohort22-50                              < 2e-16 ***
Age_Cohort51 +                               < 2e-16 ***
EthnicityWhite not Hispanic                  0.97982    
Age_Cohort6-12:EthnicityWhite not Hispanic   0.85201    
Age_Cohort13-17:EthnicityWhite not Hispanic  0.98368    
Age_Cohort18-21:EthnicityWhite not Hispanic  0.87006    
Age_Cohort22-50:EthnicityWhite not Hispanic  0.56706    
Age_Cohort51 +:EthnicityWhite not Hispanic   0.05081 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3855 on 765 degrees of freedom
Multiple R-squared:  0.9618,    Adjusted R-squared:  0.9612 
F-statistic:  1750 on 11 and 765 DF,  p-value: < 2.2e-16