# Box-Cox power transformation of the response of the fitted sales model:
# estimates the power and reports likelihood-ratio tests for lambda = 0
# (log transformation) and lambda = 1 (no transformation needed).
summary(
  car::powerTransform(lm_sales_multi$fit)
)
#> bcPower Transformation to Normality 
#>    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
#> Y1    0.9074           1       0.7569       1.0578
#> 
#> Likelihood ratio test that transformation parameter is equal to 0
#>  (log transformation)
#>                            LRT df       pval
#> LR test, lambda = (0) 147.1578  1 < 2.22e-16
#> 
#> Likelihood ratio test that no transformation is needed
#>                           LRT df    pval
#> LR test, lambda = (1) 1.41991  1 0.23342
#######################################################################
library(car)

# Test for outliers in the fitted sales model.
outlierTest(lm_sales_multi$fit)

# High-leverage points ----

# Index plot of the hat values of a fitted model.
#
# Args:
#   fit: a fitted model object supporting coefficients(), fitted() and
#        hatvalues() (e.g. an lm object).
#
# Draws dashed red reference lines at 2 and 3 times the mean hat value
# p / n; observations above them are treated as high-leverage points.
# In an interactive session the points can then be labelled by clicking,
# and the result of identify() is returned; otherwise NULL is returned
# invisibly.
hat.plot <- function(fit) {
  p <- length(coefficients(fit)) # number of estimated parameters (incl. intercept)
  n <- length(fitted(fit))       # sample size
  hat_values <- hatvalues(fit)

  plot(hat_values, main = "Index Plot of Hat Values")
  abline(h = c(2, 3) * p / n, col = "red", lty = 2)

  # identify() (not identity(), which accepts a single argument and would
  # raise an "unused arguments" error here) labels the clicked points.
  # It waits for mouse input, so it is only called interactively.
  if (interactive()) {
    identify(seq_len(n), hat_values, names(hat_values))
  } else {
    invisible(NULL)
  }
}
hat.plot(lm_sales_multi$fit)

# Influential observations ----

# Cook's D plot: values above 4 / (n - k - 1) are flagged, where k is the
# number of predictors. length(coefficients) already equals k + 1 because it
# counts the intercept, so nothing further is subtracted.
cutoff <- 4 / (nrow(advertising) - length(lm_sales_multi$fit$coefficients))
{
  plot(lm_sales_multi$fit, which = 4, cook.levels = cutoff)
  abline(h = cutoff, lty = 2, col = "red")
}

# Added-variable plots. Point labelling is requested through the `id` list
# (car >= 3.0); the valid interactive method name is "identify".
avPlots(lm_sales_multi$fit, ask = FALSE, id = list(method = "identify"))

# Influence plot (studentized residuals vs. hat values, sized by Cook's D).
influencePlot(
  lm_sales_multi$fit,
  id = list(method = "identify"),
  main = "Influence Plot"
)
17.4.6 多重共线性
Code
# Variance inflation factors of the fitted sales model, computed once and
# reused for the rule-of-thumb check below.
vif_values <- car::vif(lm_sales_multi$fit)
vif_values
#>        TV     radio newspaper 
#>  1.004611  1.144952  1.145187

# A square root of the VIF >= 2 indicates a multicollinearity problem.
sqrt(vif_values) >= 2
#>        TV     radio newspaper 
#>     FALSE     FALSE     FALSE
17.5 逐步回归
逐步回归是一种筛选变量的方法,有向前、向后和双向(两个方向同时进行)三种方法。
direction = "both"双向
direction = "backward"向后
direction = "forward"向前
Code
# Candidate models for stepwise selection. The first column of `advertising`
# is dropped before fitting.
#   step_full : sales regressed on every remaining column (upper model)
#   step_lm_0 : intercept-only model (lower model)
step_full <- lm(sales ~ ., data = advertising[-1])
step_lm_0 <- lm(sales ~ 1, data = advertising[-1])

# Forward selection: start from the intercept-only model and add terms up to
# the full model's formula.
step_forward <- stats::step(
  object = step_lm_0,
  scope = formula(step_full),
  direction = "forward"
)
#> Start:  AIC=661.8
#> sales ~ 1
#> 
#>             Df Sum of Sq    RSS    AIC
#> + TV         1    3314.6 2102.5 474.52
#> + radio      1    1798.7 3618.5 583.10
#> + newspaper  1     282.3 5134.8 653.10
#> <none>                   5417.1 661.80
#> 
#> Step:  AIC=474.52
#> sales ~ TV
#> 
#>             Df Sum of Sq     RSS    AIC
#> + radio      1   1545.62  556.91 210.82
#> + newspaper  1    183.97 1918.56 458.20
#> <none>                   2102.53 474.52
#> 
#> Step:  AIC=210.82
#> sales ~ TV + radio
#> 
#>             Df Sum of Sq    RSS    AIC
#> <none>                   556.91 210.82
#> + newspaper  1  0.088717 556.83 212.79

summary(step_forward)
#> 
#> Call:
#> lm(formula = sales ~ TV + radio, data = advertising[-1])
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -8.7977 -0.8752  0.2422  1.1708  2.8328 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  2.92110    0.29449   9.919   <2e-16 ***
#> TV           0.04575    0.00139  32.909   <2e-16 ***
#> radio        0.18799    0.00804  23.382   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.681 on 197 degrees of freedom
#> Multiple R-squared:  0.8972, Adjusted R-squared:  0.8962 
#> F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16

# Backward elimination: start from the full model and drop terms.
# (A `scope = formula(step_lm_0)` argument was left disabled here; no scope
# is passed.)
step_backward <- stats::step(
  object = step_full,
  direction = "backward"
)
#> Start:  AIC=212.79
#> sales ~ TV + radio + newspaper
#> 
#>             Df Sum of Sq    RSS    AIC
#> - newspaper  1      0.09  556.9 210.82
#> <none>                    556.8 212.79
#> - radio      1   1361.74 1918.6 458.20
#> - TV         1   3058.01 3614.8 584.90
#> 
#> Step:  AIC=210.82
#> sales ~ TV + radio
#> 
#>         Df Sum of Sq    RSS    AIC
#> <none>                556.9 210.82
#> - radio  1    1545.6 2102.5 474.52
#> - TV     1    3061.6 3618.5 583.10

summary(step_backward)
#> 
#> Call:
#> lm(formula = sales ~ TV + radio, data = advertising[-1])
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -8.7977 -0.8752  0.2422  1.1708  2.8328 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  2.92110    0.29449   9.919   <2e-16 ***
#> TV           0.04575    0.00139  32.909   <2e-16 ***
#> radio        0.18799    0.00804  23.382   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.681 on 197 degrees of freedom
#> Multiple R-squared:  0.8972, Adjusted R-squared:  0.8962 
#> F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16

# Bidirectional selection: start from the intercept-only model; at each step
# terms may be added or dropped.
step_both <- stats::step(
  object = step_lm_0,
  scope = formula(step_full),
  direction = "both"
)
#> Start:  AIC=661.8
#> sales ~ 1
#> 
#>             Df Sum of Sq    RSS    AIC
#> + TV         1    3314.6 2102.5 474.52
#> + radio      1    1798.7 3618.5 583.10
#> + newspaper  1     282.3 5134.8 653.10
#> <none>                   5417.1 661.80
#> 
#> Step:  AIC=474.52
#> sales ~ TV
#> 
#>             Df Sum of Sq    RSS    AIC
#> + radio      1    1545.6  556.9 210.82
#> + newspaper  1     184.0 1918.6 458.20
#> <none>                   2102.5 474.52
#> - TV         1    3314.6 5417.1 661.80
#> 
#> Step:  AIC=210.82
#> sales ~ TV + radio
#> 
#>             Df Sum of Sq    RSS    AIC
#> <none>                    556.9 210.82
#> + newspaper  1      0.09  556.8 212.79
#> - radio      1   1545.62 2102.5 474.52
#> - TV         1   3061.57 3618.5 583.10

summary(step_both)
#> 
#> Call:
#> lm(formula = sales ~ TV + radio, data = advertising[-1])
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -8.7977 -0.8752  0.2422  1.1708  2.8328 
#> 
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)  2.92110    0.29449   9.919   <2e-16 ***
#> TV           0.04575    0.00139  32.909   <2e-16 ***
#> radio        0.18799    0.00804  23.382   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 1.681 on 197 degrees of freedom
#> Multiple R-squared:  0.8972, Adjusted R-squared:  0.8962 
#> F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16