在同婚專法通過之前──台北市456里公投第十案空間分析

空間分析 期末專題

B03207009 呂方雯

初步環境建置與讀取檔案、資料處理、設定鄰居

rm(list = ls())
library(rgdal); library(GISTools); library(ggtern); library(dplyr); library(splancs); library(spatstat); library(magrittr); library(aspace); library(spdep);library(dplyr);library(spatialreg)
setwd("C:/Users/ruoyu/Desktop/2019 Spring/Spatial Analysis/project")
options(scipen = 7)
MRT = readOGR(dsn = "./data", layer = "MRT", encoding="unicode")
TPE = readOGR(dsn = "./data", layer = "TPE_LI", encoding = "UTF-8")
TPE = spTransform(TPE, MRT@proj4string)
referendum = read.csv("./data/ten_referendum.csv", fileEncoding = "UTF-8", sep = ",")
population = read.csv("./data/li_population.csv", fileEncoding = "UTF-8") %>% filter(性別 == "計") %>%
  mutate(
    "above_65" = (X65到69歲+X70到74歲+X75到79歲+X80到84歲+X85到89歲+X90到94歲+X95到99歲+X100歲以上)
  ) %>% select(1:2,26)
population$村里 = as.character(population$村里)
population[52,]$村里 = c("富臺里")
merge = left_join(referendum, y = population, by = c("鄉鎮市區", "村里"))

owo = as.data.frame(TPE)
TPE_join = merge(owo, merge, by.x = c("TOWN", "VILLAGE"), by.y = c("鄉鎮市區", "村里"), all = T)
TPE_join$agree_percent = round(TPE_join$同意票數/ TPE_join$投票數 * 100, 2)
TPE_join$above_65_per  = round((TPE_join$above_65) / TPE_join$人口總數 * 100, 2) 
TPE_join$income_medium = TPE_join$所得中位數 / 10

TPE$agree_percent = rep(NA,456); TPE$income_medium = rep(NA,456); TPE$above_65_per = rep(NA,456)
for (i in 1:nrow(owo)){
  for (j in 1:nrow(owo)){
    if(TPE$V_ID[j] == TPE_join$V_ID[i]) {
      TPE$agree_percent[j] = TPE_join$agree_percent[i]
      TPE$income_medium[j] = TPE_join$income_medium[i]
      TPE$above_65_per[j] = TPE_join$above_65_per[i]
    }
  }
} 

# create neighbor matrix
TPE.nb   = poly2nb(TPE)
TPE.nb.w = nb2listw(TPE.nb)

I. 描述性圖表

使用箱形圖了解本筆資料主要興趣變項(公投第十案同意百分比)以及最後迴歸會用到的變項(65歲以上人口百分比、所得中位數) 的分布及極值狀況

使用面量圖了解各里在主要興趣變項(公投第十案同意百分比)以及最後迴歸會用到的變項(65歲以上人口百分比、所得中位數) 的表現狀況。

# boxplot
par(mfrow = c(1,3))
boxplot(TPE$agree_percent, main = "同意百分比")
boxplot(TPE$above_65_per,  main = "老年人口百分比")
boxplot(TPE$income_medium, main = "年收入中位數(萬)")

# choropleth for variables included in the project
par(mfrow = c(1,1))
vacant.shades = auto.shading(TPE$agree_percent, n = 5, cols = brewer.pal(5,"YlOrRd"), cutter = quantileCuts)
choropleth(TPE, TPE$agree_percent, main ="台北市465里第十案公投同意百分比面量圖", shading = vacant.shades)
choro.legend(317907, 2786395, vacant.shades)

vacant.shades.65 = auto.shading(TPE$above_65_per, n = 5, cols = brewer.pal(5, "Blues"), cutter = quantileCuts) 
choropleth(TPE, TPE$above_65_per, main = "台北市465里65歲以上人口百分比面量圖", shading = vacant.shades.65)
choro.legend(317907, 2786395, vacant.shades.65)

vacant.shades.income = auto.shading(TPE$income_medium, n = 5, cols = brewer.pal(5, "Greens"), cutter = quantileCuts) 
choropleth(TPE, TPE$income_medium, main ="台北市465里年收入中位數(萬)面量圖", shading = vacant.shades.income)
choro.legend(317907, 2786395, vacant.shades.income)

  1. 空間自相關 (全域)

感興趣的研究變項為各里第十案公投同意百分比,此區進行的檢定顯著水準皆為 0.05

Monte-Carlo 模擬結果有顯著差異,拒絕無空間自相關的虛無假設,接受 H1(有空間自相關)。

General G statistic 換算成 Z 值後得到的 p 值 > 0.05,無法拒絕 H0。

### Moran's I ###
# 1. Moran's I coefficient
M = moran.test(TPE$agree_percent, listw = TPE.nb.w); M
## 
##  Moran I test under randomisation
## 
## data:  TPE$agree_percent  
## weights: TPE.nb.w    
## 
## Moran I statistic standard deviate = 9.4897, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      0.2554569734     -0.0021978022      0.0007371812
# 2. Monte-Carlo simulation
mc = moran.mc(TPE$agree_percent, listw = TPE.nb.w, nsim = 999)
hist(mc$res, main = "Monte-Carlo simulation", xlab = "Simulated Moran's I")
abline(v = M$estimate[1], col = "red")

# 3. Moran scatter plot
moran.plot(TPE$agree_percent, TPE.nb.w, main ="Moran scatter plot", xlab = "Agree percentage", ylab = "Agree percentage (spatially lagged)")

# 4. Correlogram
cor = sp.correlogram(TPE.nb, TPE$agree_percent, order = 10, method = "I", style = "W", zero.policy = T)
print(cor); plot(cor, ylab = "Moran's I", xlab = "lags", main = "Correlogram for Agree Percentage")
## Spatial correlogram for TPE$agree_percent 
## method: Moran's I
##              estimate  expectation     variance standard deviate
## 1 (456)   0.255456973 -0.002197802  0.000737181           9.4897
## 2 (456)   0.126043013 -0.002197802  0.000332154           7.0365
## 3 (456)   0.057000084 -0.002197802  0.000201633           4.1689
## 4 (456)   0.019411137 -0.002197802  0.000144072           1.8003
## 5 (456)  -0.002847158 -0.002197802  0.000113509          -0.0609
## 6 (456)  -0.016423767 -0.002197802  0.000095960          -1.4522
## 7 (456)  -0.031617159 -0.002197802  0.000086085          -3.1708
## 8 (456)  -0.047881321 -0.002197802  0.000082374          -5.0334
## 9 (456)  -0.027782346 -0.002197802  0.000087375          -2.7371
## 10 (456) -0.031501377 -0.002197802  0.000101478          -2.9089
##          Pr(I) two sided    
## 1 (456)        < 2.2e-16 ***
## 2 (456)        1.971e-12 ***
## 3 (456)        3.060e-05 ***
## 4 (456)         0.071814 .  
## 5 (456)         0.951400    
## 6 (456)         0.146438    
## 7 (456)         0.001520 ** 
## 8 (456)        4.818e-07 ***
## 9 (456)         0.006199 ** 
## 10 (456)        0.003627 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

### General G statistic  ###
G = globalG.test(TPE$agree_percent, listw = TPE.nb.w, zero.policy = T); G
## 
##  Getis-Ord global G statistic
## 
## data:  TPE$agree_percent 
## weights: TPE.nb.w 
## 
## standard deviate = 0.25597, p-value = 0.399
## alternative hypothesis: greater
## sample estimates:
## Global G statistic        Expectation           Variance 
##       2.198084e-03       2.197802e-03       1.213682e-12
  1. 空間自相關 (區域)

感興趣的研究變項為各里第十案公投同意百分比,此區進行的檢定顯著水準皆為 0.05,將有顯著之結果標示於圖上,可參考每張圖的圖例了解顏色對應的資訊。

以下針對 LISA 與 Gi* 分別呈現經過 FDR 校正前後的結果地圖。

### Local Moran's I Index (LISA) ###
# 1. LISA without FDR
LISA = localmoran(TPE$agree_percent, TPE.nb.w, zero.policy = T, alternative = "two.sided")
diff = TPE$agree_percent - mean(TPE$agree_percent)
z = LISA[, 4]
quad = c()
quad[diff > 0 & z > 0] = 1 # H-H
quad[diff < 0 & z > 0] = 2 # L-L
quad[diff > 0 & z < 0] = 3 # H-L
quad[diff < 0 & z < 0] = 4 # L-H
quad[LISA[, 5] > 0.05] = 5

colors = c("red", "blue", "lightpink", "skyblue2", rgb(.95, .95, .95))
plot(TPE, border = "grey", col = colors[quad], main = "台北市465里第十案公投同意百分比 Lisa Map \n(未經FDR校正)")
legend("bottomright",legend = c("High-High","Low-Low","High-Low","Low-High","Not Significant"), fill = colors,bty = "n",cex = 0.7, y.intersp = 1, x.intersp = 1)

# 2 LISA after FDR adjustion
LISA.fdr = localmoran(TPE$agree_percent, TPE.nb.w, zero.policy = T, alternative = "two.sided", p.adjust.method = "fdr")
diff = TPE$agree_percent - mean(TPE$agree_percent)
z    = LISA.fdr[, 4]
quad = c()
quad[diff > 0 & z > 0] = 1 # H-H
quad[diff < 0 & z > 0] = 2 # L-L
quad[diff > 0 & z < 0] = 3 # H-L
quad[diff < 0 & z < 0] = 4 # L-H
quad[LISA.fdr[, 5] > 0.05] = 5
plot(TPE, border = "grey", col = colors[quad], main = "台北市465里第十案公投同意百分比 Lisa Map \n(FDR 校正後)")
legend("bottomright", legend = c("High-High","Low-Low","High-Low","Low-High","Not Significant"), fill = colors,bty = "n",cex = 0.7, y.intersp = 1, x.intersp = 1)

### Local G-statistic (Gi*) ###
TPE.nb.in   = include.self(TPE.nb)
TPE.nb.in.w = nb2listw(TPE.nb.in, zero.policy=T)
LG = as.vector(localG(TPE$agree_percent, TPE.nb.in.w))
TPE$LG = LG

# 1. p-vlaues of Gi* (for agree_percent)
# use Monte-Carlo simulation to calculate the p-values of Gi*
sim.G = matrix(NA, nrow = 9999, ncol = length(LG))
for(i in 1:9999) sim.G[i,] = localG(sample(TPE$agree_percent), TPE.nb.in.w)
p.val = (colSums(sweep(sim.G,2,LG,'>'))+1)/(nrow(sim.G)+1)
TPE$p = p.val

col = colorRampPalette(c("red","orange","white"), space = "rgb")
spplot(TPE, zcol = "p", col.regions = col(20), main = "台北市465里第十案公投同意百分比\np-values of Gi* values(單尾)")

# 2. statistically significant villages (p-value < 0.05)
quad = rep(2,length(LG)) # Non-cluster
quad[p.val < 0.05] = 1 # Cluster

colors = c("red", rgb(.95, .95, .95))
plot(TPE, border = "darkgrey", col = colors[quad], main = "Cluster Map (Gi*)\n台北市465里第十案公投同意百分比")
legend("bottomright", legend = c("Cluster","Non-cluster"), fill = colors, bty = "n",cex = 0.7,y.intersp = 1,x.intersp = 1)

# 3. statistically significant villages after FDR adjustion (p-value < 0.05)
p.fdr = p.adjust(p.val, "fdr")
quad  = rep(2, length(LG)) # Non-cluster
quad[p.fdr < 0.05] = 1 # Cluster

plot(TPE, border="darkgrey", col=colors[quad], main = "Cluster Map (Gi* after SP FDR)\n台北市465里第十案公投同意百分比")
legend("bottomright", legend = c("Cluster", "Non-cluster"), fill = colors, bty = "n", cex = 0.7, y.intersp = 1, x.intersp = 1)

  1. 空間與非空間迴歸

【OLS】

迴歸式:\(agree.percent = b_1 * above.65.percent + b_2 * income.medium + \varepsilon\)

在顯著水準 0.05 下,兩個自變項皆達顯著,六十五歲以上人口百分比高、年收入中位低的地區,該里同意第十案公投的百分比(應變項)較高,Adjusted R-squared 為 0.13,仍有許多未被解釋的變異。
基於前面顯著空間自相關、目前 OLS 模型解釋力不足等原因,考慮跑空間迴歸模型,先使用 lm.LMtests,了解若使用 SLX Spatially Lagged X (Local Spatial Model) 或 Spatial Error Model (SEM) 是否可能更好地去解釋這筆資料的變異,檢定結果建議使用 SEM model,考慮該模型所代表的意涵在此筆資料中也算合理,將在下一小節嘗試 SEM 模型。

# regression formula for the following analysis 
reg.eq = agree_percent ~ above_65_per + income_medium

### OLS ###
# y = X*beta + varepsilon
reg1 = lm(reg.eq, data = TPE)
summary(reg1)
## 
## Call:
## lm(formula = reg.eq, data = TPE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4518 -1.2933 -0.1633  1.1454  8.3108 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   66.625892   0.830642  80.210  < 2e-16 ***
## above_65_per   0.077202   0.031493   2.451   0.0146 *  
## income_medium -0.061084   0.007913  -7.719 7.56e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.056 on 453 degrees of freedom
## Multiple R-squared:  0.1324, Adjusted R-squared:  0.1286 
## F-statistic: 34.57 on 2 and 453 DF,  p-value: 1.063e-14
# check whether it's needed to run spatial models
lm.LMtests(reg1, TPE.nb.w, test=c("LMerr", "LMlag", "RLMerr", "RLMlag"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
## 
## LMerr = 69.389, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
## 
## LMlag = 60.635, df = 1, p-value = 6.883e-15
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
## 
## RLMerr = 8.8135, df = 1, p-value = 0.00299
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = reg.eq, data = TPE)
## weights: TPE.nb.w
## 
## RLMlag = 0.059281, df = 1, p-value = 0.8076

【SEM】

迴歸式: \(agree.percent = {\beta}X + u, u = {\lambda}W*u + e\)    

Lambda = 0.47425, p-value < 0.05,拒絕虛無假設 (lambda = 0),spatial autoregressive coefficient (lambda) 不為零。

### Spatial Error Model (global) ###
# y = X*beta + u, u = lambda*W*u + e
reg2 = errorsarlm(reg.eq, data = TPE, TPE.nb.w)
summary(reg2)
## 
## Call:errorsarlm(formula = reg.eq, data = TPE, listw = TPE.nb.w)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -6.67237 -1.18930 -0.13015  1.02401  8.56800 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                 Estimate Std. Error z value        Pr(>|z|)
## (Intercept)   66.1467248  1.0683275 61.9161       < 2.2e-16
## above_65_per   0.1043818  0.0386251  2.7024        0.006883
## income_medium -0.0606780  0.0095104 -6.3801 0.0000000001769
## 
## Lambda: 0.47425, LR test value: 54.03, p-value: 1.9751e-13
## Asymptotic standard error: 0.059442
##     z-value: 7.9783, p-value: 1.5543e-15
## Wald statistic: 63.653, p-value: 1.4433e-15
## 
## Log likelihood: -947.1247 for error model
## ML residual variance (sigma squared): 3.5708, (sigma: 1.8897)
## Number of observations: 456 
## Number of parameters estimated: 5 
## AIC: 1904.2, (AIC for lm: 1956.3)

【SAR】

迴歸式: \(agree.percent = {\beta}X + {\rho}W*agree.percent + {\varepsilon}\)   

因為也可能鄰居對同婚議題的支持度會直接對自己的支持度有所影響,也考慮另一種試圖檢驗這點的 SAR spatial lag model。
檢定結果 Rho = 0.43151, LR test value = 48.005, p-value < 0.05,拒絕虛無假設,有顯著受鄰居影響的效果。最後,因為有空間的 spillover,為了能更好詮釋迴歸係數的意涵,進行 Monte-Carlo 模擬得到 direct/indirect effect 的 p-value,各項皆顯著 (< 0.05) (因為是 MC 模擬,每次結果數值會略有差異)

### Spatial Lag Model (global) ###
# y = rho*W*y + x*beta + varepsilon
reg3 = lagsarlm(reg.eq, data = TPE, TPE.nb.w)
summary(reg3)
## 
## Call:lagsarlm(formula = reg.eq, data = TPE, listw = TPE.nb.w)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -6.87700 -1.19234 -0.14193  1.09604  8.51271 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                 Estimate Std. Error z value        Pr(>|z|)
## (Intercept)   38.3109957  3.9279144  9.7535       < 2.2e-16
## above_65_per   0.0766536  0.0293684  2.6101        0.009052
## income_medium -0.0478162  0.0077651 -6.1578 0.0000000007375
## 
## Rho: 0.43151, LR test value: 48.005, p-value: 4.2519e-12
## Asymptotic standard error: 0.058511
##     z-value: 7.3748, p-value: 1.6454e-13
## Wald statistic: 54.387, p-value: 1.6465e-13
## 
## Log likelihood: -950.1371 for lag model
## ML residual variance (sigma squared): 3.6481, (sigma: 1.91)
## Number of observations: 456 
## Number of parameters estimated: 5 
## AIC: 1910.3, (AIC for lm: 1956.3)
## LM test for residual autocorrelation
## test value: 1.6487, p-value: 0.19913
# impacts(reg3,listw = TPE.nb.w)
# These pvalues are simulated, and seem to vary a bit from run to run.
summary(impacts(reg3, listw = TPE.nb.w, R = 500),zstats = TRUE) 
## Impact measures (lag, exact):
##                    Direct    Indirect       Total
## above_65_per   0.07961824  0.05521863  0.13483687
## income_medium -0.04966553 -0.03444515 -0.08411068
## ========================================================
## Simulation results (asymptotic variance matrix):
## Direct:
## 
## Iterations = 1:500
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 500 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean       SD  Naive SE Time-series SE
## above_65_per   0.08017 0.030809 0.0013778      0.0013778
## income_medium -0.04900 0.007887 0.0003527      0.0003981
## 
## 2. Quantiles for each variable:
## 
##                   2.5%      25%      50%      75%    97.5%
## above_65_per   0.01540  0.05961  0.08130  0.09981  0.13580
## income_medium -0.06492 -0.05413 -0.04928 -0.04371 -0.03371
## 
## ========================================================
## Indirect:
## 
## Iterations = 1:500
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 500 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean       SD Naive SE Time-series SE
## above_65_per   0.05589 0.024543 0.001098       0.001098
## income_medium -0.03416 0.008341 0.000373       0.000373
## 
## 2. Quantiles for each variable:
## 
##                   2.5%      25%      50%      75%    97.5%
## above_65_per   0.01151  0.03978  0.05444  0.06994  0.10987
## income_medium -0.05296 -0.03916 -0.03340 -0.02827 -0.02069
## 
## ========================================================
## Total:
## 
## Iterations = 1:500
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 500 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                   Mean      SD  Naive SE Time-series SE
## above_65_per   0.13606 0.05329 0.0023832      0.0023832
## income_medium -0.08316 0.01398 0.0006251      0.0007325
## 
## 2. Quantiles for each variable:
## 
##                   2.5%      25%      50%     75%    97.5%
## above_65_per   0.02691  0.10010  0.13734  0.1686  0.23861
## income_medium -0.11190 -0.09198 -0.08299 -0.0733 -0.05761
## 
## ========================================================
## Simulated standard errors
##                    Direct    Indirect      Total
## above_65_per  0.030808632 0.024542582 0.05328953
## income_medium 0.007886807 0.008341043 0.01397662
## 
## Simulated z-values:
##                  Direct  Indirect     Total
## above_65_per   2.602044  2.277348  2.553173
## income_medium -6.213413 -4.094841 -5.949882
## 
## Simulated p-values:
##               Direct           Indirect    Total          
## above_65_per  0.009267         0.022765    0.010675       
## income_medium 0.00000000051846 0.000042246 0.0000000026834