03 Regression (Exercise 15)
Yeon Soo, Choi
15. Boston Data
Boston
데이터는 crim
, zn
, indus
, chas
, nox
, rm
, age
, dis
, rad
, tax
, ptratio
, black
, lstat
, medv
이렇게 14개의 변수들로 이루어져 있으며,
15번 예제의 경우 마을 별 인당 범죄율 에 대한 정보를 갖는 crim
변수를 종속변수 로 사용하여 예측하는 것이 목표다.
library(knitr)
library(MASS)
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
colSums(is.na(Boston))
## crim zn indus chas nox rm age dis rad
## 0 0 0 0 0 0 0 0 0
## tax ptratio black lstat medv
## 0 0 0 0 0
## chas : Charles River dummy variable (=1 bounds river, =0 otherwise)
Boston$chas=factor(Boston$chas,c(0,1),labels=c('N','Y'))
kable(head(Boston))
crim | zn | indus | chas | nox | rm | age | dis | rad | tax | ptratio | black | lstat | medv |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0.00632 | 18 | 2.31 | N | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296 | 15.3 | 396.90 | 4.98 | 24.0 |
0.02731 | 0 | 7.07 | N | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242 | 17.8 | 396.90 | 9.14 | 21.6 |
0.02729 | 0 | 7.07 | N | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242 | 17.8 | 392.83 | 4.03 | 34.7 |
0.03237 | 0 | 2.18 | N | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222 | 18.7 | 394.63 | 2.94 | 33.4 |
0.06905 | 0 | 2.18 | N | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222 | 18.7 | 396.90 | 5.33 | 36.2 |
0.02985 | 0 | 2.18 | N | 0.458 | 6.430 | 58.7 | 6.0622 | 3 | 222 | 18.7 | 394.12 | 5.21 | 28.7 |
(a) 각각의 독립 변수들로 단순 선형 회귀 모형을 적합하고 통계적으로 유의한 모형인지 아닌지 판단하여라.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(corrplot)
## corrplot 0.84 loaded
preds=names(Boston)[c(2:3,5:14)]
## plot response by predictors
featurePlot(x=Boston[,preds],y=Boston$crim,pch=20,cex=1.2)
corrplot(cor(Boston[,preds]),method='square')
ZN : proportion of residential land zoned for lots over 25,000 sq.ft.
## 13 simple linear regression models
attach(Boston)
## yes
lm.zn = lm(crim~zn)
summary(lm.zn)
##
## Call:
## lm(formula = crim ~ zn)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.429 -4.222 -2.620 1.250 84.523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.45369 0.41722 10.675 < 2e-16 ***
## zn -0.07393 0.01609 -4.594 5.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.435 on 504 degrees of freedom
## Multiple R-squared: 0.04019, Adjusted R-squared: 0.03828
## F-statistic: 21.1 on 1 and 504 DF, p-value: 5.506e-06
INDUS : proportion of non-retail business acres per town.
## yes
lm.indus = lm(crim~indus)
summary(lm.indus)
##
## Call:
## lm(formula = crim ~ indus)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.972 -2.698 -0.736 0.712 81.813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.06374 0.66723 -3.093 0.00209 **
## indus 0.50978 0.05102 9.991 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.866 on 504 degrees of freedom
## Multiple R-squared: 0.1653, Adjusted R-squared: 0.1637
## F-statistic: 99.82 on 1 and 504 DF, p-value: < 2.2e-16
CHAS : Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
## no
lm.chas = lm(crim~chas)
summary(lm.chas)
##
## Call:
## lm(formula = crim ~ chas)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.738 -3.661 -3.435 0.018 85.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7444 0.3961 9.453 <2e-16 ***
## chasY -1.8928 1.5061 -1.257 0.209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.597 on 504 degrees of freedom
## Multiple R-squared: 0.003124, Adjusted R-squared: 0.001146
## F-statistic: 1.579 on 1 and 504 DF, p-value: 0.2094
NOX : nitrogen oxides concentration (parts per 10 million).
## yes
lm.nox = lm(crim~nox)
summary(lm.nox)
##
## Call:
## lm(formula = crim ~ nox)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.371 -2.738 -0.974 0.559 81.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.720 1.699 -8.073 5.08e-15 ***
## nox 31.249 2.999 10.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.81 on 504 degrees of freedom
## Multiple R-squared: 0.1772, Adjusted R-squared: 0.1756
## F-statistic: 108.6 on 1 and 504 DF, p-value: < 2.2e-16
RM : average number of rooms per dwelling.
## yes
lm.rm = lm(crim~rm)
summary(lm.rm)
##
## Call:
## lm(formula = crim ~ rm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.604 -3.952 -2.654 0.989 87.197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.482 3.365 6.088 2.27e-09 ***
## rm -2.684 0.532 -5.045 6.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.401 on 504 degrees of freedom
## Multiple R-squared: 0.04807, Adjusted R-squared: 0.04618
## F-statistic: 25.45 on 1 and 504 DF, p-value: 6.347e-07
AGE : proportion of owner-occupied units built prior to 1940.
## yes
lm.age = lm(crim~age)
summary(lm.age)
##
## Call:
## lm(formula = crim ~ age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.789 -4.257 -1.230 1.527 82.849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.77791 0.94398 -4.002 7.22e-05 ***
## age 0.10779 0.01274 8.463 2.85e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.057 on 504 degrees of freedom
## Multiple R-squared: 0.1244, Adjusted R-squared: 0.1227
## F-statistic: 71.62 on 1 and 504 DF, p-value: 2.855e-16
DIS : weighted mean of distances to five Boston employment centres.
## yes
lm.dis = lm(crim~dis)
summary(lm.dis)
##
## Call:
## lm(formula = crim ~ dis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.708 -4.134 -1.527 1.516 81.674
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.4993 0.7304 13.006 <2e-16 ***
## dis -1.5509 0.1683 -9.213 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.965 on 504 degrees of freedom
## Multiple R-squared: 0.1441, Adjusted R-squared: 0.1425
## F-statistic: 84.89 on 1 and 504 DF, p-value: < 2.2e-16
RAD : index of accessibility to radial highways.
## yes
lm.rad = lm(crim~rad)
summary(lm.rad)
##
## Call:
## lm(formula = crim ~ rad)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.164 -1.381 -0.141 0.660 76.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.28716 0.44348 -5.157 3.61e-07 ***
## rad 0.61791 0.03433 17.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.718 on 504 degrees of freedom
## Multiple R-squared: 0.3913, Adjusted R-squared: 0.39
## F-statistic: 323.9 on 1 and 504 DF, p-value: < 2.2e-16
TAX : full-value property-tax rate per $10,000.
## yes
lm.tax = lm(crim~tax)
summary(lm.tax)
##
## Call:
## lm(formula = crim ~ tax)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.513 -2.738 -0.194 1.065 77.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.528369 0.815809 -10.45 <2e-16 ***
## tax 0.029742 0.001847 16.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.997 on 504 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.3383
## F-statistic: 259.2 on 1 and 504 DF, p-value: < 2.2e-16
PTRATIO : pupil-teacher ratio by town.
## yes
lm.ptratio = lm(crim~ptratio)
summary(lm.ptratio)
##
## Call:
## lm(formula = crim ~ ptratio)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.654 -3.985 -1.912 1.825 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.6469 3.1473 -5.607 3.40e-08 ***
## ptratio 1.1520 0.1694 6.801 2.94e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.24 on 504 degrees of freedom
## Multiple R-squared: 0.08407, Adjusted R-squared: 0.08225
## F-statistic: 46.26 on 1 and 504 DF, p-value: 2.943e-11
BLACK : 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
## yes
lm.black = lm(crim~black)
summary(lm.black)
##
## Call:
## lm(formula = crim ~ black)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.756 -2.299 -2.095 -1.296 86.822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.553529 1.425903 11.609 <2e-16 ***
## black -0.036280 0.003873 -9.367 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.946 on 504 degrees of freedom
## Multiple R-squared: 0.1483, Adjusted R-squared: 0.1466
## F-statistic: 87.74 on 1 and 504 DF, p-value: < 2.2e-16
LSTAT : lower status of the population (percent).
## yes
lm.lstat = lm(crim~lstat)
summary(lm.lstat)
##
## Call:
## lm(formula = crim ~ lstat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.925 -2.822 -0.664 1.079 82.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.33054 0.69376 -4.801 2.09e-06 ***
## lstat 0.54880 0.04776 11.491 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.664 on 504 degrees of freedom
## Multiple R-squared: 0.2076, Adjusted R-squared: 0.206
## F-statistic: 132 on 1 and 504 DF, p-value: < 2.2e-16
MEDV : median value of owner-occupied homes in $1000s.
lm.medv = lm(crim~medv)
summary(lm.medv)
##
## Call:
## lm(formula = crim ~ medv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.071 -4.022 -2.343 1.298 80.957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.79654 0.93419 12.63 <2e-16 ***
## medv -0.36316 0.03839 -9.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.934 on 504 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1491
## F-statistic: 89.49 on 1 and 504 DF, p-value: < 2.2e-16
chas
변수를 제외하고 모두 계수의 유의성 검정 결과 유의하다고 결론지었다.
(b) 모든 독립변수들을 사용하여 다중 선형 회귀 모형을 적합하여라. 결과를 설명하고 어떤 독립변수들이 T-검정 하의 단일 계수의 유의성 검정 결과 유의한가?
95% 신뢰 수준에서 T-검정으로 계수의 유의성에 대해 검정한 결과 zn
, dis
, rad
, black
, medv
다섯 가지 변수들이 통계적으로 유의하다고 결론지었다.
lm.all = lm(crim~., data=Boston)
summary(lm.all)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.924 -2.120 -0.353 1.019 75.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.033228 7.234903 2.354 0.018949 *
## zn 0.044855 0.018734 2.394 0.017025 *
## indus -0.063855 0.083407 -0.766 0.444294
## chasY -0.749134 1.180147 -0.635 0.525867
## nox -10.313535 5.275536 -1.955 0.051152 .
## rm 0.430131 0.612830 0.702 0.483089
## age 0.001452 0.017925 0.081 0.935488
## dis -0.987176 0.281817 -3.503 0.000502 ***
## rad 0.588209 0.088049 6.680 6.46e-11 ***
## tax -0.003780 0.005156 -0.733 0.463793
## ptratio -0.271081 0.186450 -1.454 0.146611
## black -0.007538 0.003673 -2.052 0.040702 *
## lstat 0.126211 0.075725 1.667 0.096208 .
## medv -0.198887 0.060516 -3.287 0.001087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4396
## F-statistic: 31.47 on 13 and 492 DF, p-value: < 2.2e-16
(c) 단순선형회귀에서의 계수 값들을 X축에, 다중선형회귀에서의 계수 값들을 Y축에 그려내 비교하여라.
x = c(coefficients(lm.zn)[2],
coefficients(lm.indus)[2],
coefficients(lm.chas)[2],
coefficients(lm.nox)[2],
coefficients(lm.rm)[2],
coefficients(lm.age)[2],
coefficients(lm.dis)[2],
coefficients(lm.rad)[2],
coefficients(lm.tax)[2],
coefficients(lm.ptratio)[2],
coefficients(lm.black)[2],
coefficients(lm.lstat)[2],
coefficients(lm.medv)[2])
y = coefficients(lm.all)[2:14]
z = names(Boston)[2:14]
c=data.frame(x=x,y=y,z=z)
library(ggplot2)
ggplot(c)+
geom_point(aes(x=x,y=y,color=z),size=2)+
theme_classic()+
xlab('Simple Linear Regression Coefficient Estimates')+
ylab('Multiple Linear Regression Coefficient Estimates')+
ggtitle('Comparison of Coefficient Estimates')
nox
변수의 경우 단순 선형 회귀 모형에서의 계수가 약 31 이었던 반면 다중선형회귀에서는 약 -10 의 값을 갖는 눈에 띄는 차이가 발견되었다.
(d) 독립변수와 종속변수 간의 비선형 관계가 존재한다는 증거가 있는가? 이를 확인하기 위해 3차 다항 회귀를 적합하여라.
앞서 그렸던 종속변수 crim
에 대해 각각의 독립변수를 그려낸 결과를 통해서도 비선형 관계가 존재한다는 것을 알 수 있다.
featurePlot(x=Boston[,preds],y=Boston$crim,pch=20,cex=1.2)
우선 문제에서 주어진 대로 3차 다항회귀를 적합한 결과는 다음과 같다.
zn3=lm(crim~poly(zn,3),Boston)
indus3=lm(crim~poly(indus,3),Boston)
nox3=lm(crim~poly(nox,3),Boston)
rm3=lm(crim~poly(rm,3),Boston)
age3=lm(crim~poly(age,3),Boston)
dis3=lm(crim~poly(dis,3),Boston)
rad3=lm(crim~poly(rad,3),Boston)
tax3=lm(crim~poly(tax,3),Boston)
ptratio3=lm(crim~poly(ptratio,3),Boston)
black3=lm(crim~poly(black,3),Boston)
lstat3=lm(crim~poly(lstat,3),Boston)
medv3=lm(crim~poly(medv,3),Boston)
ZN
## 1,2
summary(zn3)
##
## Call:
## lm(formula = crim ~ poly(zn, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.821 -4.614 -1.294 0.473 84.130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3722 9.709 < 2e-16 ***
## poly(zn, 3)1 -38.7498 8.3722 -4.628 4.7e-06 ***
## poly(zn, 3)2 23.9398 8.3722 2.859 0.00442 **
## poly(zn, 3)3 -10.0719 8.3722 -1.203 0.22954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.372 on 502 degrees of freedom
## Multiple R-squared: 0.05824, Adjusted R-squared: 0.05261
## F-statistic: 10.35 on 3 and 502 DF, p-value: 1.281e-06
INDUS
## 1,2,3
summary(indus3)
##
## Call:
## lm(formula = crim ~ poly(indus, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.278 -2.514 0.054 0.764 79.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.330 10.950 < 2e-16 ***
## poly(indus, 3)1 78.591 7.423 10.587 < 2e-16 ***
## poly(indus, 3)2 -24.395 7.423 -3.286 0.00109 **
## poly(indus, 3)3 -54.130 7.423 -7.292 1.2e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.423 on 502 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2552
## F-statistic: 58.69 on 3 and 502 DF, p-value: < 2.2e-16
NOX
## 1,2,3
summary(nox3)
##
## Call:
## lm(formula = crim ~ poly(nox, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.110 -2.068 -0.255 0.739 78.302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3216 11.237 < 2e-16 ***
## poly(nox, 3)1 81.3720 7.2336 11.249 < 2e-16 ***
## poly(nox, 3)2 -28.8286 7.2336 -3.985 7.74e-05 ***
## poly(nox, 3)3 -60.3619 7.2336 -8.345 6.96e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.234 on 502 degrees of freedom
## Multiple R-squared: 0.297, Adjusted R-squared: 0.2928
## F-statistic: 70.69 on 3 and 502 DF, p-value: < 2.2e-16
RM
## 1,2
summary(rm3)
##
## Call:
## lm(formula = crim ~ poly(rm, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.485 -3.468 -2.221 -0.015 87.219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3703 9.758 < 2e-16 ***
## poly(rm, 3)1 -42.3794 8.3297 -5.088 5.13e-07 ***
## poly(rm, 3)2 26.5768 8.3297 3.191 0.00151 **
## poly(rm, 3)3 -5.5103 8.3297 -0.662 0.50858
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.33 on 502 degrees of freedom
## Multiple R-squared: 0.06779, Adjusted R-squared: 0.06222
## F-statistic: 12.17 on 3 and 502 DF, p-value: 1.067e-07
AGE
## 1,2,3
summary(age3)
##
## Call:
## lm(formula = crim ~ poly(age, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.762 -2.673 -0.516 0.019 82.842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3485 10.368 < 2e-16 ***
## poly(age, 3)1 68.1820 7.8397 8.697 < 2e-16 ***
## poly(age, 3)2 37.4845 7.8397 4.781 2.29e-06 ***
## poly(age, 3)3 21.3532 7.8397 2.724 0.00668 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.84 on 502 degrees of freedom
## Multiple R-squared: 0.1742, Adjusted R-squared: 0.1693
## F-statistic: 35.31 on 3 and 502 DF, p-value: < 2.2e-16
DIS
## 1,2,3
summary(dis3)
##
## Call:
## lm(formula = crim ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.757 -2.588 0.031 1.267 76.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3259 11.087 < 2e-16 ***
## poly(dis, 3)1 -73.3886 7.3315 -10.010 < 2e-16 ***
## poly(dis, 3)2 56.3730 7.3315 7.689 7.87e-14 ***
## poly(dis, 3)3 -42.6219 7.3315 -5.814 1.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.331 on 502 degrees of freedom
## Multiple R-squared: 0.2778, Adjusted R-squared: 0.2735
## F-statistic: 64.37 on 3 and 502 DF, p-value: < 2.2e-16
RAD
## 1,2
summary(rad3)
##
## Call:
## lm(formula = crim ~ poly(rad, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.381 -0.412 -0.269 0.179 76.217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.2971 12.164 < 2e-16 ***
## poly(rad, 3)1 120.9074 6.6824 18.093 < 2e-16 ***
## poly(rad, 3)2 17.4923 6.6824 2.618 0.00912 **
## poly(rad, 3)3 4.6985 6.6824 0.703 0.48231
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.682 on 502 degrees of freedom
## Multiple R-squared: 0.4, Adjusted R-squared: 0.3965
## F-statistic: 111.6 on 3 and 502 DF, p-value: < 2.2e-16
TAX
## 1,2
summary(tax3)
##
## Call:
## lm(formula = crim ~ poly(tax, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.273 -1.389 0.046 0.536 76.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3047 11.860 < 2e-16 ***
## poly(tax, 3)1 112.6458 6.8537 16.436 < 2e-16 ***
## poly(tax, 3)2 32.0873 6.8537 4.682 3.67e-06 ***
## poly(tax, 3)3 -7.9968 6.8537 -1.167 0.244
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.854 on 502 degrees of freedom
## Multiple R-squared: 0.3689, Adjusted R-squared: 0.3651
## F-statistic: 97.8 on 3 and 502 DF, p-value: < 2.2e-16
PTRATIO
## 1,2,3
summary(ptratio3)
##
## Call:
## lm(formula = crim ~ poly(ptratio, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.833 -4.146 -1.655 1.408 82.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.361 10.008 < 2e-16 ***
## poly(ptratio, 3)1 56.045 8.122 6.901 1.57e-11 ***
## poly(ptratio, 3)2 24.775 8.122 3.050 0.00241 **
## poly(ptratio, 3)3 -22.280 8.122 -2.743 0.00630 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.122 on 502 degrees of freedom
## Multiple R-squared: 0.1138, Adjusted R-squared: 0.1085
## F-statistic: 21.48 on 3 and 502 DF, p-value: 4.171e-13
BLACK
## 1
summary(black3)
##
## Call:
## lm(formula = crim ~ poly(black, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.096 -2.343 -2.128 -1.439 86.790
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3536 10.218 <2e-16 ***
## poly(black, 3)1 -74.4312 7.9546 -9.357 <2e-16 ***
## poly(black, 3)2 5.9264 7.9546 0.745 0.457
## poly(black, 3)3 -4.8346 7.9546 -0.608 0.544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.955 on 502 degrees of freedom
## Multiple R-squared: 0.1498, Adjusted R-squared: 0.1448
## F-statistic: 29.49 on 3 and 502 DF, p-value: < 2.2e-16
LSTAT
## 1,2
summary(lstat3)
##
## Call:
## lm(formula = crim ~ poly(lstat, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.234 -2.151 -0.486 0.066 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6135 0.3392 10.654 <2e-16 ***
## poly(lstat, 3)1 88.0697 7.6294 11.543 <2e-16 ***
## poly(lstat, 3)2 15.8882 7.6294 2.082 0.0378 *
## poly(lstat, 3)3 -11.5740 7.6294 -1.517 0.1299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.629 on 502 degrees of freedom
## Multiple R-squared: 0.2179, Adjusted R-squared: 0.2133
## F-statistic: 46.63 on 3 and 502 DF, p-value: < 2.2e-16
MEDV
## 1,2,3
summary(medv3)
##
## Call:
## lm(formula = crim ~ poly(medv, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.427 -1.976 -0.437 0.439 73.655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614 0.292 12.374 < 2e-16 ***
## poly(medv, 3)1 -75.058 6.569 -11.426 < 2e-16 ***
## poly(medv, 3)2 88.086 6.569 13.409 < 2e-16 ***
## poly(medv, 3)3 -48.033 6.569 -7.312 1.05e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.569 on 502 degrees of freedom
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4167
## F-statistic: 121.3 on 3 and 502 DF, p-value: < 2.2e-16
Modeling
Train ,Test Split
최종 모델을 만들고 성능을 최대화하기 위해 Train, Test Set을 80:20 으로 split 하였다.
## train test split
set.seed(0)
idx=createDataPartition(Boston$crim,p=0.8, list=FALSE)
train=Boston[idx,]
test=Boston[-idx,]
cat('train :', dim(train),'\n')
## train : 406 14
cat('test :',dim(test),'\n')
## test : 100 14
앞서 적합했던 모든 변수를 활용한 다중선형회귀 모형으로 적합하여 RMSE를 정의하고 Adjusted R-Squared, Training MSE, Test RMSE를 계산한 결과는 다음과 같다.
## RMSE
rmse=function(pred,test){
return (sqrt(mean((pred-test)^2)))
}
## multiple linear regression model 0
lm.all=lm(crim~.,data=train)
summary(lm.all)
##
## Call:
## lm(formula = crim ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.585 -2.056 -0.314 0.960 75.271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.892402 8.071437 1.845 0.06578 .
## zn 0.037134 0.020486 1.813 0.07064 .
## indus -0.065881 0.090542 -0.728 0.46728
## chasY -0.717544 1.326687 -0.541 0.58892
## nox -10.430999 5.937637 -1.757 0.07974 .
## rm 0.680576 0.663050 1.026 0.30532
## age -0.001330 0.020322 -0.065 0.94785
## dis -0.845642 0.307731 -2.748 0.00627 **
## rad 0.584262 0.101421 5.761 1.69e-08 ***
## tax -0.003118 0.005808 -0.537 0.59174
## ptratio -0.250424 0.208977 -1.198 0.23151
## black -0.012120 0.004066 -2.981 0.00306 **
## lstat 0.187297 0.085497 2.191 0.02906 *
## medv -0.165184 0.064730 -2.552 0.01109 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.497 on 392 degrees of freedom
## Multiple R-squared: 0.468, Adjusted R-squared: 0.4503
## F-statistic: 26.52 on 13 and 392 DF, p-value: < 2.2e-16
##
cat('Adjusted R-Squared :',summary(lm.all)$adj.r.squared,'\n')
## Adjusted R-Squared : 0.4503116
train_y=predict(lm.all,newdata=train)
test_y=predict(lm.all,newdata=test)
##
cat('Training RMSE : ',rmse(train_y,train$crim),'\n')
## Training RMSE : 6.383586
cat('Test RMSE : ',rmse(test_y,test$crim),'\n')
## Test RMSE : 6.318677
이제 모형의 개선을 위해 먼저 lm.all
의 잔차도를 살펴보았다.
par(mfrow=c(2,2))
plot(lm.all,pch=20,col='lightgray',cex=1.5)
잔차의 분산이 증가하는 추세로 판단되어 이를 해결하기 위해 종속변수의 Concave 함수 변환을 고려하였다.
Log Transformation
featurePlot(x=Boston[,preds],y=log(Boston$crim),pch=20,cex=1.2)
변환 결과 눈에 띄는 독립, 종속 변수간 선형성이 확보되었다.
성능 측면에서도, 모든 독립변수를 사용하고 종속 변수 crim
만 log 변환하여 다중선형회귀를 적합한 결과 모형의 R-Squared 는 크게 증가하였지만 RMSE는 조금 증가하는 모습을 보였다.
## multiple linear regression model 0
lm.alllog=lm(log(crim)~.,data=train)
##
cat('Adjusted R-Squared :',summary(lm.alllog)$adj.r.squared,'\n')
## Adjusted R-Squared : 0.8729427
train_y=predict(lm.alllog,newdata=train)
test_y=predict(lm.alllog,newdata=test)
##
cat('Training RMSE : ',rmse(exp(train_y),train$crim),'\n')
## Training RMSE : 6.61039
cat('Test RMSE : ',rmse(exp(test_y),test$crim),'\n')
## Test RMSE : 6.60867
또한 앞서 확인했던 Collinearity 문제를 살펴보자.
corrplot(cor(Boston[,preds]),method='square')
변수 간의 다중공선성이 존재한다. 이를 vif
함수를 이용해 찾아보겠다.
library(car)
## Loading required package: carData
vif(lm.alllog)
## zn indus chas nox rm age dis rad
## 2.308185 3.729722 1.087154 4.470080 2.242700 3.121104 4.132339 7.443892
## tax ptratio black lstat medv
## 9.125379 1.991624 1.337849 3.549093 3.671576
독립변수의 분산팽창계수값이 출력되었다. 값을 보니 10을 넘는 독립변수는 존재하지 않는다.
Additional Variable Transformations
Sqrt(LSTAT)
plot(sqrt(Boston$lstat),log(Boston$crim))
Sqrt(MEDV)
plot(sqrt(medv),log(Boston$crim))
Log(RAD)
plot(log(rad),log(Boston$crim))
Log(TAX)
plot(log(tax),log(Boston$crim))
Sqrt(AGE)
plot(sqrt((age)),log(Boston$crim))
Log(DIS)
plot(log(dis),log(Boston$crim))
Log(ZN)
plot(log(zn),log(Boston$crim))
Log(INDUS)
plot(log(indus),log(Boston$crim))
Log(NOX)
plot(log(nox),log(Boston$crim))
lm.final1=lm(log(crim)~sqrt(lstat)+sqrt(medv)+log(rad)+log(tax)+sqrt(age)+log(dis)+(zn)+log(indus)+log(nox),data=train)
summary(lm.final1)
##
## Call:
## lm(formula = log(crim) ~ sqrt(lstat) + sqrt(medv) + log(rad) +
## log(tax) + sqrt(age) + log(dis) + (zn) + log(indus) + log(nox),
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0719 -0.5195 -0.0594 0.5055 2.2962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.864934 1.495321 -4.591 5.93e-06 ***
## sqrt(lstat) 0.126453 0.082002 1.542 0.1239
## sqrt(medv) -0.018809 0.079528 -0.237 0.8132
## log(rad) 1.056101 0.082449 12.809 < 2e-16 ***
## log(tax) 0.872048 0.208532 4.182 3.56e-05 ***
## sqrt(age) 0.040039 0.034053 1.176 0.2404
## log(dis) -0.314417 0.167572 -1.876 0.0613 .
## zn -0.005383 0.002439 -2.207 0.0279 *
## log(indus) 0.140990 0.093822 1.503 0.1337
## log(nox) 2.570235 0.452253 5.683 2.57e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7894 on 396 degrees of freedom
## Multiple R-squared: 0.8695, Adjusted R-squared: 0.8665
## F-statistic: 293.1 on 9 and 396 DF, p-value: < 2.2e-16
lm.final2=lm(log(crim)~log(rad)*zn+log(tax)+log(dis)+log(nox),data=train)
summary(lm.final2)
##
## Call:
## lm(formula = log(crim) ~ log(rad) * zn + log(tax) + log(dis) +
## log(nox), data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.96582 -0.50396 -0.00454 0.49870 2.24887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.953929 1.161620 -4.265 2.50e-05 ***
## log(rad) 1.231726 0.091695 13.433 < 2e-16 ***
## zn 0.007350 0.004186 1.756 0.079889 .
## log(tax) 0.724318 0.203846 3.553 0.000426 ***
## log(dis) -0.378470 0.145468 -2.602 0.009620 **
## log(nox) 2.997089 0.421411 7.112 5.33e-12 ***
## log(rad):zn -0.013934 0.003106 -4.487 9.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7809 on 399 degrees of freedom
## Multiple R-squared: 0.8713, Adjusted R-squared: 0.8694
## F-statistic: 450.2 on 6 and 399 DF, p-value: < 2.2e-16
cat('Adjusted R-Squared :',summary(lm.final2)$adj.r.squared,'\n')
## Adjusted R-Squared : 0.8693522
train_y=predict(lm.final2,newdata=train)
test_y=predict(lm.final2,newdata=test)
##
cat('Training RMSE : ',rmse(exp(train_y),train$crim),'\n')
## Training RMSE : 6.954659
cat('Test RMSE : ',rmse(exp(test_y),test$crim),'\n')
## Test RMSE : 6.103154