讀取資料與初步整理
library(sf);library(spdep);library(RColorBrewer);library(units);library(spgwr);library("randomForest")
setwd("D:/SA_Book/Lab_Data_Ch9")
TOWN=st_read(dsn="TNN_KAOH_dengue2015_factor.shp")
問題9-1:最小平方法(OLS)
2015年台南市與高雄市的登革熱確診案例多集中於哪些地區?確診案例在空間中的不均分布可能與各地的社會環境條件有關。請利用複回歸模式,以最小平方法(OLS)找出哪些社會因子與研究區的登革熱確診案例數在統計上呈顯著相關,並說明是如何相關。可能的社會因子包括人口密度、扶養比、高等教育程度以及平均收入(請在不考慮人口效果的假設下,以登革熱確診病例數為依變項進行分析)。
OLS=lm(DEN_CNT~POP_DEN+COLLEGEA_P+TAX_AVERAG+DEPND_RATI,TOWN)
summary(OLS)
##
## Call:
## lm(formula = DEN_CNT ~ POP_DEN + COLLEGEA_P + TAX_AVERAG + DEPND_RATI,
## data = TOWN)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1438.4 -236.3 -44.8 97.6 3689.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -799.31948 965.25391 -0.828 0.4104
## POP_DEN 0.06403 0.02735 2.341 0.0221 *
## COLLEGEA_P 8243.46132 1710.71065 4.819 8.1e-06 ***
## TAX_AVERAG 1.01347 0.82016 1.236 0.2207
## DEPND_RATI 33.46594 1690.74946 0.020 0.9843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 715.1 on 70 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.6172
## F-statistic: 30.83 on 4 and 70 DF, p-value: 8.466e-15
報表與圖9-3中ArcGIS PRO之報表相同。
圖9-4可用以下程式碼來畫出該欄位直方圖,及與應變數的散布圖,以下使用POP_DEN人口密度當範例。
attach(TOWN)
par(mfrow=c(1,2))
hist(POP_DEN)
plot(DEN_CNT~POP_DEN);abline(lm(DEN_CNT~POP_DEN), col="red")
問題9-2:OLS殘差的空間自相關檢定
根據多元迴歸分析的結果,請觀察其殘差的分布狀況,哪些地區的登革熱病例數在模式中被高估?哪些被低估?請以Moran’s I檢定殘差的空間趨勢特徵,評估是否有進行地理加權迴歸(GWR)的必要性。
殘差空間分布
# 繪製殘差空間分布圖
TOWN$residuals=rstandard(OLS) #標準化殘差
plot(TOWN['residuals'])
# 分成七類繪圖
par(mar=c(0,0,0,0))
col7=rev(brewer.pal(7,"RdBu"))
TOWN$res_col=col7[cut(TOWN$residuals,c(-Inf,-2.5,-1.5,-0.5,0.5,1.5,2.5,Inf))]
plot(st_geometry(TOWN),col=TOWN$res_col)
legend("bottomright",legend=c("2.5以上","1.5~2.5","0.5~1.5","-0.5~0.5","-1.5~-0.5","-2.5~-1.5","-2.5以下"), fill=rev(col7),bty="n",cex=0.7,y.intersp=1,x.intersp=1)
空間自相關判定
# 操作請參考7-1
center=st_centroid(TOWN)
dist=st_distance(center)
dist=drop_units(dist)
weight=1/dist
weight[dist>14012]=0
diag(weight)=0
TOWN.nbw=mat2listw(weight,style="W")
# moran.test函數:計算Moran's I指數
moran.test(TOWN$residuals,TOWN.nbw)
##
## Moran I test under randomisation
##
## data: TOWN$residuals
## weights: TOWN.nbw
##
## Moran I statistic standard deviate = 6.4374, p-value = 6.079e-11
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.356636072 -0.013513514 0.003306281
計算結果Moran’s I指數為0.3566,Z分數為6.4374,p值為0。因此為顯著群聚。
與ArcGIS PRO的數值有些許不同,是殘差進行標準化時的計算方法不同。 若使用未標準化的殘差(OLS$residuals)進行運算,則會與ArcGIS PRO的數值結果相同,如下報表。
moran.test(OLS$residuals,TOWN.nbw)
##
## Moran I test under randomisation
##
## data: OLS$residuals
## weights: TOWN.nbw
##
## Moran I statistic standard deviate = 6.3548, p-value = 1.043e-10
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.349120030 -0.013513514 0.003256337
計算結果Moran’s I指數為0.3491,Z分數為6.3548。
問題9-3:最佳化參數設定與統計顯著性校正
根據最小平方法(OLS)的殘差空間自相關檢定結果,當殘差在空間中並無呈隨機分布時,可嘗試利用地理加權迴歸(GWR)來分析因子間的相關性在不同地區間的差異。請問,哪些社會因子與地區的登革熱病例數有顯著相關?其相關性強度的空間分布為何?
地理加權迴歸
#轉換成sp格式
TOWN.sp=as_Spatial(TOWN)
#最佳寬帶(bandwidth)評估,亦可照書中直接設定為20000公尺
GWRbandwidth = gwr.sel(DEN_CNT~POP_DEN+COLLEGEA_P+TAX_AVERAG+DEPND_RATI, data = TOWN.sp, gweight = gwr.Gauss, verbose = FALSE)
#地理加權迴歸(GWR)
GWRmodel = gwr(DEN_CNT~POP_DEN+COLLEGEA_P+TAX_AVERAG+DEPND_RATI, data = TOWN.sp, bandwidth = GWRbandwidth,gweight = gwr.Gauss, hatmatrix = TRUE)
#數值結果表
results = as.data.frame(GWRmodel$SDF)
#報表
GWRmodel
## Call:
## gwr(formula = DEN_CNT ~ POP_DEN + COLLEGEA_P + TAX_AVERAG + DEPND_RATI,
## data = TOWN.sp, bandwidth = GWRbandwidth, gweight = gwr.Gauss,
## hatmatrix = TRUE)
## Kernel function: gwr.Gauss
## Fixed bandwidth: 16626.92
## Summary of GWR coefficient estimates at data points:
## Min. 1st Qu. Median 3rd Qu. Max.
## X.Intercept. -1.4471e+03 -4.5971e+02 -7.6079e+01 8.6030e+02 1.3715e+03
## POP_DEN 3.6606e-02 4.2216e-02 1.9243e-01 2.9995e-01 3.4078e-01
## COLLEGEA_P -7.3431e+02 6.7114e+03 8.1290e+03 8.6428e+03 1.0127e+04
## TAX_AVERAG -2.3914e+00 -1.4660e+00 -1.6987e-01 2.6681e-01 1.5401e+00
## DEPND_RATI -3.5468e+02 1.2717e+02 3.6197e+02 5.2460e+02 8.6940e+02
## Global
## X.Intercept. -799.3195
## POP_DEN 0.0640
## COLLEGEA_P 8243.4613
## TAX_AVERAG 1.0135
## DEPND_RATI 33.4659
## Number of data points: 75
## Effective number of parameters (residual: 2traceS - traceS'S): 18.77586
## Effective degrees of freedom (residual: 2traceS - traceS'S): 56.22414
## Sigma (residual: 2traceS - traceS'S): 524.9061
## Effective number of parameters (model: traceS): 14.79194
## Effective degrees of freedom (model: traceS): 60.20806
## Sigma (model: traceS): 507.2426
## Sigma (ML): 454.4775
## AICc (GWR p. 61, eq 2.33; p. 96, eq. 4.21): 1171.408
## AIC (GWR p. 96, eq. 4.22): 1145.505
## Residual sum of squares: 15491237
## Quasi-global R2: 0.8432926
地理加權迴歸係數的空間分布
#人口密度
TOWN$GWR.POP_DEN=results$POP_DEN
plot(TOWN['GWR.POP_DEN'],pal=hcl.colors(7,"YlOrRd",rev=T))
#高等教育程度
TOWN$GWR.COLLEGEA_P=results$COLLEGEA_P
plot(TOWN['GWR.COLLEGEA_P'],pal=hcl.colors(12,"YlOrRd",rev=T))
殘差空間相依性
#標準化殘差
TOWN$resGWR=results$gwr.e/sd(results$gwr.e)
plot(TOWN['resGWR'])
#分七層著色
par(mar=c(0,0,0,0))
TOWN$resGWR_col=col7[cut(TOWN$resGWR,c(-Inf,-2.5,-1.5,-0.5,0.5,1.5,2.5,Inf))]
plot(st_geometry(TOWN),col=TOWN$resGWR_col)
legend("bottomright",legend=c("2.5以上","1.5~2.5","0.5~1.5","-0.5~0.5","-1.5~-0.5","-2.5~-1.5","-2.5以下"), fill=rev(col7),bty="n",cex=0.7,y.intersp=1,x.intersp=1)
#空間=自相關檢定
moran.test(TOWN$resGWR,TOWN.nbw)
##
## Moran I test under randomisation
##
## data: TOWN$resGWR
## weights: TOWN.nbw
##
## Moran I statistic standard deviate = -0.25304, p-value = 0.5999
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.028383433 -0.013513514 0.003453377
問題9-4:隨機森林的預測模式
請比較複回歸與地理加權回歸模式的R-square值,並改以隨機森林模式建立預測各地區登革熱發病病例數的機器學習模式。根據模式結果,哪些社會因子的重要性較高?此結果與回歸分析的結果是否一致?請問隨機森林的R-square值為何?是否有比統計模型結果的R-square值更高?
隨機森林模型
RF = randomForest(DEN_CNT~POP_DEN+COLLEGEA_P+TAX_AVERAG+DEPND_RATI,TOWN,ntree=100)
RF
##
## Call:
## randomForest(formula = DEN_CNT ~ POP_DEN + COLLEGEA_P + TAX_AVERAG + DEPND_RATI, data = TOWN, ntree = 100)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 595407.5
## % Var explained: 54.83
變數重要性
importance(RF)
## IncNodePurity
## POP_DEN 26805966
## COLLEGEA_P 30820874
## TAX_AVERAG 24416897
## DEPND_RATI 10150134
importance(RF)/sum(importance(RF))*100 #相對值
## IncNodePurity
## POP_DEN 29.07565
## COLLEGEA_P 33.43050
## TAX_AVERAG 26.48430
## DEPND_RATI 11.00955