4. multicollinearity

4. multicollinearity#

suppressPackageStartupMessages({
  library(tidyverse)
  library(tidymodels)
  library(readxl)
  library(multcomp)
  library(car)
})
df <- read_excel("data/Table4_2.xls")
head(df,3)
A tibble: 3 × 3
expendincomewealth
<dbl><dbl><dbl>
70 80 810
651001009
901201273
model <- lm(expend ~ income + wealth, data = df)
tidy(model)
glance(model)$r.squared
A tibble: 3 × 5
termestimatestd.errorstatisticp.value
<chr><dbl><dbl><dbl><dbl>
(Intercept)24.774733276.75249960 3.66897220.007975077
income 0.941537340.82289826 1.14417220.290164748
wealth -0.042434530.08066448-0.52606210.615094539
0.963504395243514

none of the slope coeffi cients is statistically signifi cant, as the t values are statistically insignificant. Yet the R2 value is very high.

model2 <- lm(expend ~ income, data = df)
tidy(model2)
glance(model2)$r.squared
A tibble: 2 × 5
termestimatestd.errorstatisticp.value
<chr><dbl><dbl><dbl><dbl>
(Intercept)24.45454556.41381730 3.8127915.142172e-03
income 0.50909090.0357428114.2431715.752746e-07
0.962061560486757

income alone has significant impact on expenditure,

model3 <- lm(expend ~ wealth, data = df)
tidy(model3)
glance(model3)$r.squared
A tibble: 2 × 5
termestimatestd.errorstatisticp.value
<chr><dbl><dbl><dbl><dbl>
(Intercept)24.411044856.874096840 3.5511647.496699e-03
wealth 0.049763770.00374398613.2916569.802080e-07
0.956679038871206

wealth alone has a significant impact on expenditure

model4 <- lm(wealth ~ income, data = df)
tidy(model4)
glance(model4)$r.squared
A tibble: 2 × 5
termestimatestd.errorstatisticp.value
<chr><dbl><dbl><dbl><dbl>
(Intercept) 7.54545529.4758107 0.2559888.044195e-01
income 10.190909 0.164262362.0404745.064901e-12
0.997925860058925

wealth and income are highly related

Example#

df2 <- read_excel("data/Table4_4.xls")
head(df2,3)
A tibble: 3 × 25
taxableincfederaltaxhsiblingshfathereduchmothereducsiblingslfphourskidsl6kids618hageheduchwagefamincmtrmothereducfathereducunemploymentlargecityexper
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
122001494114164116101034124.0288163100.7215127 5014
1800026158 7 30116560230 98.4416218000.6615 77111 5
2400039574 7102119801340123.5807210400.6915127 5015

Hours: hours worked in 1975 (dependent variable)
Kidslt6: number of kids under age 6
Kidsge6: number of kids between ages 6 and 18
Age: woman’s age in years
Educ: years of schooling
Wage: estimated wage from earnings
Hushrs: hours worked by husband
Husage: husband’s age
Huseduc: husband’s years of schooling
Huswage: husband’s hourly wage, 1975
Faminc: family income in 1975
Mtr: federal marginal tax rate facing a woman
motheduc: mother’s years of schooling
fatheduc: father’s years of schooling
Unem: unemployment rate in county of residence
exper: actual labor market experience

assess the impact of several socio-economic variables on married women’s hours of work in the labor market. This is cross-sectional data on 753 married women for the year 1975. It should be noted that there were 325 married women who did not work and hence had zero hours of work.

model5 <- lm(hours ~ age + educ + exper + faminc + fathereduc + 
            hage + heduc + hhours + hwage + kidsl6 + kids618 +
            wage + mothereduc + mtr + unemployment, data = df2)

tidy(model5)
glance(model5)
A tibble: 16 × 5
termestimatestd.errorstatisticp.value
<chr><dbl><dbl><dbl><dbl>
(Intercept) 5805.47230706.843439e+02 8.48326711.194368e-16
age -24.00441106.859174e+00-3.49960694.939481e-04
educ -13.77605781.528931e+01-0.90102573.678690e-01
exper 34.95387993.456319e+0010.11303771.331411e-22
faminc 0.01566834.641855e-03 3.37543947.755884e-04
fathereduc -4.82067718.673176e+00-0.55581455.785063e-01
hage -5.02822986.669922e+00-0.75386644.511702e-01
heduc -11.69524851.101195e+01-1.06205042.885607e-01
hhours -0.33890675.107115e-02-6.63597146.246245e-11
hwage -107.33735431.091622e+01-9.83282911.600070e-21
kidsl6 -322.71688695.345385e+01-6.03729932.482936e-09
kids618 -4.04220052.149471e+01-0.18805568.508849e-01
wage 51.67676048.714492e+00 5.92997994.655600e-09
mothereduc 9.84221519.195859e+00 1.07028782.848402e-01
mtr -3956.76862737.215385e+02-5.48379415.723755e-08
unemployment -7.68658047.979074e+00-0.96334243.356917e-01
A tibble: 1 × 12
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><int><int>
0.4343070.4227936661.973237.721798.317512e-8115-5951.27911936.5612015.17322959634737753
colnames(df2)
  1. 'taxableinc'
  2. 'federaltax'
  3. 'hsiblings'
  4. 'hfathereduc'
  5. 'hmothereduc'
  6. 'siblings'
  7. 'lfp'
  8. 'hours'
  9. 'kidsl6'
  10. 'kids618'
  11. 'age'
  12. 'educ'
  13. 'wage'
  14. 'wage76'
  15. 'hhours'
  16. 'hage'
  17. 'heduc'
  18. 'hwage'
  19. 'faminc'
  20. 'mtr'
  21. 'mothereduc'
  22. 'fathereduc'
  23. 'unemployment'
  24. 'largecity'
  25. 'exper'
df2 %>%
  summarize(correlation = cor(hage, faminc, use = "complete.obs"))
A tibble: 1 × 1
correlation
<dbl>
0.04050263
correlation_matrix <- cor(df2, use = "complete.obs")

heatmap(correlation_matrix, 
        col = colorRampPalette(c("blue", "white", "red"))(100),
        symm = TRUE,         
        margins = c(8, 8),   
        main = "Correlation Heatmap of df2")
_images/932da9dbc042ccb4bf75a5454ee66fb16ac26706dbd2f08ed6c4c57ff4779043.png
vif(model5)
age
5.26144147726373
educ
2.0858157402719
exper
1.33480562987452
faminc
5.49467416520725
fathereduc
1.64735313494315
hage
4.95813811611547
heduc
1.89893838127637
hhours
1.58762941970657
hwage
3.65996838421493
kidsl6
1.34613841992449
kids618
1.3812278886656
wage
1.36962420005194
mothereduc
1.6456166790573
mtr
6.22848954766369
unemployment
1.06008242126237
model6 <- lm(hours ~ age + educ + exper + faminc + hhours + hwage + kidsl6  +
            wage  + mtr + unemployment, data = df2)

tidy(model6)
glance(model6)
A tibble: 11 × 5
termestimatestd.errorstatisticp.value
<chr><dbl><dbl><dbl><dbl>
(Intercept) 5.720767e+036.622272e+02 8.63867703.464404e-17
age -2.823358e+013.566101e+00 -7.91721488.858432e-15
educ -1.828395e+011.223935e+01 -1.49386581.356358e-01
exper 3.511494e+013.394298e+00 10.34527221.595033e-23
faminc 1.592231e-024.509078e-03 3.53116794.392850e-04
hhours -3.461675e-015.018601e-02 -6.89769001.132866e-11
hwage -1.100438e+021.066498e+01-10.31824032.040707e-23
kidsl6 -3.193502e+025.231116e+01 -6.10482031.658396e-09
wage 5.188557e+018.657064e+00 5.99343703.204685e-09
mtr -3.929831e+036.850461e+02 -5.73659361.406737e-08
unemployment-7.721889e+007.940710e+00 -0.97244323.311470e-01
A tibble: 1 × 12
r.squaredadj.r.squaredsigmastatisticp.valuedflogLikAICBICdeviancedf.residualnobs
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><int><int>
0.4321220.4244687661.011956.461871.932607e-8410-5952.7311929.4611984.95324207062742753
vif(model6)
age
1.42629876639497
educ
1.34053961380487
exper
1.29107879991459
faminc
5.19991905936222
hhours
1.53753610679107
hwage
3.50360087467717
kidsl6
1.29295288892452
wage
1.35556617939112
mtr
5.63073994873757
unemployment
1.05296872093097

Principal components analysis#

Results are different from the book

scaled_data <- scale(df2)
pca_result <- prcomp(scaled_data, center = TRUE, scale = TRUE)
pca_components <- pca_result$rotation

# Extract standardized scores (principal component scores)
pca_scores <- pca_result$x

# Extract standard deviations of principal components
pca_std_dev <- pca_result$sdev
variance_explained <- pca_std_dev^2 / sum(pca_std_dev^2)
screeplot(pca_result, type = "lines", main = "Scree Plot")
_images/25e52c20db9ea474a2e55fe4197dd2f1f8b51ef907f6e093d750864699fc8968.png
summary(pca_result)
Importance of components:
                         PC1    PC2    PC3     PC4     PC5     PC6     PC7
Standard deviation     2.019 1.7615 1.6080 1.56381 1.18734 1.16346 1.13759
Proportion of Variance 0.163 0.1241 0.1034 0.09782 0.05639 0.05415 0.05176
Cumulative Proportion  0.163 0.2871 0.3906 0.48837 0.54476 0.59891 0.65067
                           PC8     PC9    PC10    PC11    PC12    PC13    PC14
Standard deviation     0.98662 0.93992 0.93027 0.88700 0.85488 0.83909 0.79942
Proportion of Variance 0.03894 0.03534 0.03462 0.03147 0.02923 0.02816 0.02556
Cumulative Proportion  0.68961 0.72495 0.75956 0.79103 0.82027 0.84843 0.87399
                          PC15    PC16    PC17   PC18    PC19    PC20    PC21
Standard deviation     0.77286 0.71328 0.65361 0.6285 0.58349 0.56742 0.44504
Proportion of Variance 0.02389 0.02035 0.01709 0.0158 0.01362 0.01288 0.00792
Cumulative Proportion  0.89788 0.91824 0.93532 0.9511 0.96474 0.97762 0.98554
                          PC22    PC23    PC24    PC25
Standard deviation     0.35882 0.32453 0.31267 0.17204
Proportion of Variance 0.00515 0.00421 0.00391 0.00118
Cumulative Proportion  0.99069 0.99491 0.99882 1.00000

Exercises#

🚧 Under Construction