初步環境建置與讀取檔案、資料處理、設定鄰居
rm(list = ls())
library(rgdal); library(GISTools); library(ggtern); library(dplyr); library(splancs); library(spatstat); library(magrittr); library(aspace); library(spdep);library(dplyr);library(spatialreg)
setwd("C:/Users/ruoyu/Desktop/2019 Spring/Spatial Analysis/project")
options(scipen = 7)
MRT = readOGR(dsn = "./data", layer = "MRT", encoding="unicode")
TPE = readOGR(dsn = "./data", layer = "TPE_LI", encoding = "UTF-8")
TPE = spTransform(TPE, MRT@proj4string)
referendum = read.csv("./data/ten_referendum.csv", fileEncoding = "UTF-8", sep = ",")
population = read.csv("./data/li_population.csv", fileEncoding = "UTF-8") %>% filter(性別 == "計") %>%
mutate(
"above_65" = (X65到69歲+X70到74歲+X75到79歲+X80到84歲+X85到89歲+X90到94歲+X95到99歲+X100歲以上)
) %>% select(1:2,26)
population$村里 = as.character(population$村里)
population[52,]$村里 = c("富臺里")
merge = left_join(referendum, y = population, by = c("鄉鎮市區", "村里"))
owo = as.data.frame(TPE)
TPE_join = merge(owo, merge, by.x = c("TOWN", "VILLAGE"), by.y = c("鄉鎮市區", "村里"), all = T)
TPE_join$agree_percent = round(TPE_join$同意票數/ TPE_join$投票數 * 100, 2)
TPE_join$above_65_per = round((TPE_join$above_65) / TPE_join$人口總數 * 100, 2)
TPE_join$income_medium = TPE_join$所得中位數 / 10
TPE$agree_percent = rep(NA,456); TPE$income_medium = rep(NA,456); TPE$above_65_per = rep(NA,456)
for (i in 1:nrow(owo)){
for (j in 1:nrow(owo)){
if(TPE$V_ID[j] == TPE_join$V_ID[i]) {
TPE$agree_percent[j] = TPE_join$agree_percent[i]
TPE$income_medium[j] = TPE_join$income_medium[i]
TPE$above_65_per[j] = TPE_join$above_65_per[i]
}
}
}
# create neighbor matrix
TPE.nb = poly2nb(TPE)
TPE.nb.w = nb2listw(TPE.nb)
I. 描述性圖表
# boxplot
par(mfrow = c(1,3))
boxplot(TPE$agree_percent, main = "同意百分比")
boxplot(TPE$above_65_per, main = "老年人口百分比")
boxplot(TPE$income_medium, main = "年收入中位數(萬)")
# choropleth for variables included in the project
par(mfrow = c(1,1))
vacant.shades = auto.shading(TPE$agree_percent, n = 5, cols = brewer.pal(5,"YlOrRd"), cutter = quantileCuts)
choropleth(TPE, TPE$agree_percent, main ="台北市465里第十案公投同意百分比面量圖", shading = vacant.shades)
choro.legend(317907, 2786395, vacant.shades)
vacant.shades.65 = auto.shading(TPE$above_65_per, n = 5, cols = brewer.pal(5, "Blues"), cutter = quantileCuts)
choropleth(TPE, TPE$above_65_per, main = "台北市465里65歲以上人口百分比面量圖", shading = vacant.shades.65)
choro.legend(317907, 2786395, vacant.shades.65)
vacant.shades.income = auto.shading(TPE$income_medium, n = 5, cols = brewer.pal(5, "Greens"), cutter = quantileCuts)
choropleth(TPE, TPE$income_medium, main ="台北市465里年收入中位數(萬)面量圖", shading = vacant.shades.income)
choro.legend(317907, 2786395, vacant.shades.income)
- 空間自相關 (全域)
### Moran's I ###
# 1. Moran's I coefficient
M = moran.test(TPE$agree_percent, listw = TPE.nb.w); M
##
## Moran I test under randomisation
##
## data: TPE$agree_percent
## weights: TPE.nb.w
##
## Moran I statistic standard deviate = 9.4897, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.2554569734 -0.0021978022 0.0007371812
# 2. Monte-Carlo simulation
mc = moran.mc(TPE$agree_percent, listw = TPE.nb.w, nsim = 999)
hist(mc$res, main = "Monte-Carlo simulation", xlab = "Simulated Moran's I")
abline(v = M$estimate[1], col = "red")
# 3. Moran scatter plot
moran.plot(TPE$agree_percent, TPE.nb.w, main ="Moran scatter plot", xlab = "Agree percentage", ylab = "Agree percentage (spatially lagged)")
# 4. Correlogram
cor = sp.correlogram(TPE.nb, TPE$agree_percent, order = 10, method = "I", style = "W", zero.policy = T)
print(cor); plot(cor, ylab = "Moran's I", xlab = "lags", main = "Correlogram for Agree Percentage")
## Spatial correlogram for TPE$agree_percent
## method: Moran's I
## estimate expectation variance standard deviate
## 1 (456) 0.255456973 -0.002197802 0.000737181 9.4897
## 2 (456) 0.126043013 -0.002197802 0.000332154 7.0365
## 3 (456) 0.057000084 -0.002197802 0.000201633 4.1689
## 4 (456) 0.019411137 -0.002197802 0.000144072 1.8003
## 5 (456) -0.002847158 -0.002197802 0.000113509 -0.0609
## 6 (456) -0.016423767 -0.002197802 0.000095960 -1.4522
## 7 (456) -0.031617159 -0.002197802 0.000086085 -3.1708
## 8 (456) -0.047881321 -0.002197802 0.000082374 -5.0334
## 9 (456) -0.027782346 -0.002197802 0.000087375 -2.7371
## 10 (456) -0.031501377 -0.002197802 0.000101478 -2.9089
## Pr(I) two sided
## 1 (456) < 2.2e-16 ***
## 2 (456) 1.971e-12 ***
## 3 (456) 3.060e-05 ***
## 4 (456) 0.071814 .
## 5 (456) 0.951400
## 6 (456) 0.146438
## 7 (456) 0.001520 **
## 8 (456) 4.818e-07 ***
## 9 (456) 0.006199 **
## 10 (456) 0.003627 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
### General G statistic ###
G = globalG.test(TPE$agree_percent, listw = TPE.nb.w, zero.policy = T); G
##
## Getis-Ord global G statistic
##
## data: TPE$agree_percent
## weights: TPE.nb.w
##
## standard deviate = 0.25597, p-value = 0.399
## alternative hypothesis: greater
## sample estimates:
## Global G statistic Expectation Variance
## 2.198084e-03 2.197802e-03 1.213682e-12
- 空間自相關 (區域)
### Local Moran's I Index (LISA) ###
# 1. LISA without FDR
LISA = localmoran(TPE$agree_percent, TPE.nb.w, zero.policy = T, alternative = "two.sided")
diff = TPE$agree_percent - mean(TPE$agree_percent)
z = LISA[, 4]
quad = c()
quad[diff > 0 & z > 0] = 1 # H-H
quad[diff < 0 & z > 0] = 2 # L-L
quad[diff > 0 & z < 0] = 3 # H-L
quad[diff < 0 & z < 0] = 4 # L-H
quad[LISA[, 5] > 0.05] = 5
colors = c("red", "blue", "lightpink", "skyblue2", rgb(.95, .95, .95))
plot(TPE, border = "grey", col = colors[quad], main = "台北市465里第十案公投同意百分比 Lisa Map \n(未經FDR校正)")
legend("bottomright",legend = c("High-High","Low-Low","High-Low","Low-High","Not Significant"), fill = colors,bty = "n",cex = 0.7, y.intersp = 1, x.intersp = 1)
# 2 LISA after FDR adjustion
LISA.fdr = localmoran(TPE$agree_percent, TPE.nb.w, zero.policy = T, alternative = "two.sided", p.adjust.method = "fdr")
diff = TPE$agree_percent - mean(TPE$agree_percent)
z = LISA.fdr[, 4]
quad = c()
quad[diff > 0 & z > 0] = 1 # H-H
quad[diff < 0 & z > 0] = 2 # L-L
quad[diff > 0 & z < 0] = 3 # H-L
quad[diff < 0 & z < 0] = 4 # L-H
quad[LISA.fdr[, 5] > 0.05] = 5
plot(TPE, border = "grey", col = colors[quad], main = "台北市465里第十案公投同意百分比 Lisa Map \n(FDR 校正後)")
legend("bottomright", legend = c("High-High","Low-Low","High-Low","Low-High","Not Significant"), fill = colors,bty = "n",cex = 0.7, y.intersp = 1, x.intersp = 1)
### Local G-statistic (Gi*) ###
TPE.nb.in = include.self(TPE.nb)
TPE.nb.in.w = nb2listw(TPE.nb.in, zero.policy=T)
LG = as.vector(localG(TPE$agree_percent, TPE.nb.in.w))
TPE$LG = LG
# 1. p-vlaues of Gi* (for agree_percent)
# use Monte-Carlo simulation to calculate the p-values of Gi*
sim.G = matrix(NA, nrow = 9999, ncol = length(LG))
for(i in 1:9999) sim.G[i,] = localG(sample(TPE$agree_percent), TPE.nb.in.w)
p.val = (colSums(sweep(sim.G,2,LG,'>'))+1)/(nrow(sim.G)+1)
TPE$p = p.val
col = colorRampPalette(c("red","orange","white"), space = "rgb")
spplot(TPE, zcol = "p", col.regions = col(20), main = "台北市465里第十案公投同意百分比\np-values of Gi* values(單尾)")
# 2. statistically significant villages (p-value < 0.05)
quad = rep(2,length(LG)) # Non-cluster
quad[p.val < 0.05] = 1 # Cluster
colors = c("red", rgb(.95, .95, .95))
plot(TPE, border = "darkgrey", col = colors[quad], main = "Cluster Map (Gi*)\n台北市465里第十案公投同意百分比")
legend("bottomright", legend = c("Cluster","Non-cluster"), fill = colors, bty = "n",cex = 0.7,y.intersp = 1,x.intersp = 1)
# 3. statistically significant villages after FDR adjustion (p-value < 0.05)
p.fdr = p.adjust(p.val, "fdr")
quad = rep(2, length(LG)) # Non-cluster
quad[p.fdr < 0.05] = 1 # Cluster
plot(TPE, border="darkgrey", col=colors[quad], main = "Cluster Map (Gi* after SP FDR)\n台北市465里第十案公投同意百分比")
legend("bottomright", legend = c("Cluster", "Non-cluster"), fill = colors, bty = "n", cex = 0.7, y.intersp = 1, x.intersp = 1)
- 空間與非空間迴歸
在顯著水準 0.05 下,兩個自變項皆達顯著,六十五歲以上人口百分比高、年收入中位低的地區,該里同意第十案公投的百分比(應變項)較高,Adjusted R-squared 為 0.13,仍有許多未被解釋的變異。
基於前面顯著空間自相關、目前 OLS 模型解釋力不足等原因,考慮跑空間迴歸模型,先使用 lm.LMtests,了解若使用 SLX Spatially Lagged X (Local Spatial Model) 或 Spatial Error Model (SEM) 是否可能更好地去解釋這筆資料的變異,檢定結果建議使用 SEM model,考慮該模型所代表的意涵在此筆資料中也算合理,將在下一小節嘗試 SEM 模型。
# regression formula for the following analysis
reg.eq = agree_percent ~ above_65_per + income_medium
### OLS ###
# y = X*beta + varepsilon
reg1 = lm(reg.eq, data = TPE)
summary(reg1)
##
## Call:
## lm(formula = reg.eq, data = TPE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.4518 -1.2933 -0.1633 1.1454 8.3108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.625892 0.830642 80.210 < 2e-16 ***
## above_65_per 0.077202 0.031493 2.451 0.0146 *
## income_medium -0.061084 0.007913 -7.719 7.56e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.056 on 453 degrees of freedom
## Multiple R-squared: 0.1324, Adjusted R-squared: 0.1286
## F-statistic: 34.57 on 2 and 453 DF, p-value: 1.063e-14
# check whether it's needed to run spatial models
lm.LMtests(reg1, TPE.nb.w, test=c("LMerr", "LMlag", "RLMerr", "RLMlag"))
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
##
## LMerr = 69.389, df = 1, p-value < 2.2e-16
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
##
## LMlag = 60.635, df = 1, p-value = 6.883e-15
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
##
## RLMerr = 8.8135, df = 1, p-value = 0.00299
##
##
## Lagrange multiplier diagnostics for spatial dependence
##
## data:
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
##
## RLMlag = 0.059281, df = 1, p-value = 0.8076
Lambda = 0.47425, p-value < 0.05,拒絕虛無假設 (lambda = 0),spatial autoregressive coefficient (lambda) 不為零。
### Spatial Error Model (global) ###
# y = X*beta + u, u = lambda*W*u + e
reg2 = errorsarlm(reg.eq, data = TPE, TPE.nb.w)
summary(reg2)
##
## Call:errorsarlm(formula = reg.eq, data = TPE, listw = TPE.nb.w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.67237 -1.18930 -0.13015 1.02401 8.56800
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 66.1467248 1.0683275 61.9161 < 2.2e-16
## above_65_per 0.1043818 0.0386251 2.7024 0.006883
## income_medium -0.0606780 0.0095104 -6.3801 0.0000000001769
##
## Lambda: 0.47425, LR test value: 54.03, p-value: 1.9751e-13
## Asymptotic standard error: 0.059442
## z-value: 7.9783, p-value: 1.5543e-15
## Wald statistic: 63.653, p-value: 1.4433e-15
##
## Log likelihood: -947.1247 for error model
## ML residual variance (sigma squared): 3.5708, (sigma: 1.8897)
## Number of observations: 456
## Number of parameters estimated: 5
## AIC: 1904.2, (AIC for lm: 1956.3)
因為也可能鄰居對同婚議題的支持度會直接對自己的支持度有所影響,也考慮另一種試圖檢驗這點的 SAR spatial lag model。
檢定結果 Rho = 0.43151, LR test value = 48.005, p-value < 0.05,拒絕虛無假設,有顯著受鄰居影響的效果。最後,因為有空間的 spillover,為了能更好詮釋迴歸係數的意涵,進行 Monte-Carlo 模擬得到 direct/indirect effect 的 p-value,各項皆顯著 (< 0.05) (因為是 MC 模擬,每次結果數值會略有差異)
### Spatial Lag Model (global) ###
# y = rho*W*y + x*beta + varepsilon
reg3 = lagsarlm(reg.eq, data = TPE, TPE.nb.w)
summary(reg3)
##
## Call:lagsarlm(formula = reg.eq, data = TPE, listw = TPE.nb.w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.87700 -1.19234 -0.14193 1.09604 8.51271
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 38.3109957 3.9279144 9.7535 < 2.2e-16
## above_65_per 0.0766536 0.0293684 2.6101 0.009052
## income_medium -0.0478162 0.0077651 -6.1578 0.0000000007375
##
## Rho: 0.43151, LR test value: 48.005, p-value: 4.2519e-12
## Asymptotic standard error: 0.058511
## z-value: 7.3748, p-value: 1.6454e-13
## Wald statistic: 54.387, p-value: 1.6465e-13
##
## Log likelihood: -950.1371 for lag model
## ML residual variance (sigma squared): 3.6481, (sigma: 1.91)
## Number of observations: 456
## Number of parameters estimated: 5
## AIC: 1910.3, (AIC for lm: 1956.3)
## LM test for residual autocorrelation
## test value: 1.6487, p-value: 0.19913
# impacts(reg3,listw = TPE.nb.w)
# These pvalues are simulated, and seem to vary a bit from run to run.
summary(impacts(reg3, listw = TPE.nb.w, R = 500),zstats = TRUE)
## Impact measures (lag, exact):
## Direct Indirect Total
## above_65_per 0.07961824 0.05521863 0.13483687
## income_medium -0.04966553 -0.03444515 -0.08411068
## ========================================================
## Simulation results (asymptotic variance matrix):
## Direct:
##
## Iterations = 1:500
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 500
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## above_65_per 0.08017 0.030809 0.0013778 0.0013778
## income_medium -0.04900 0.007887 0.0003527 0.0003981
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## above_65_per 0.01540 0.05961 0.08130 0.09981 0.13580
## income_medium -0.06492 -0.05413 -0.04928 -0.04371 -0.03371
##
## ========================================================
## Indirect:
##
## Iterations = 1:500
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 500
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## above_65_per 0.05589 0.024543 0.001098 0.001098
## income_medium -0.03416 0.008341 0.000373 0.000373
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## above_65_per 0.01151 0.03978 0.05444 0.06994 0.10987
## income_medium -0.05296 -0.03916 -0.03340 -0.02827 -0.02069
##
## ========================================================
## Total:
##
## Iterations = 1:500
## Thinning interval = 1
## Number of chains = 1
## Sample size per chain = 500
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## above_65_per 0.13606 0.05329 0.0023832 0.0023832
## income_medium -0.08316 0.01398 0.0006251 0.0007325
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## above_65_per 0.02691 0.10010 0.13734 0.1686 0.23861
## income_medium -0.11190 -0.09198 -0.08299 -0.0733 -0.05761
##
## ========================================================
## Simulated standard errors
## Direct Indirect Total
## above_65_per 0.030808632 0.024542582 0.05328953
## income_medium 0.007886807 0.008341043 0.01397662
##
## Simulated z-values:
## Direct Indirect Total
## above_65_per 2.602044 2.277348 2.553173
## income_medium -6.213413 -4.094841 -5.949882
##
## Simulated p-values:
## Direct Indirect Total
## above_65_per 0.009267 0.022765 0.010675
## income_medium 0.00000000051846 0.000042246 0.0000000026834