View the source on GitHub: https://github.com/markegge/safety-demo
This repository demonstrates an end-to-end analysis of the question:
How does roadway lighting affect roadway safety?
This project demonstrates the use of the R statistical programming language and several packages to conduct a “data science” analysis of the relationship between roadway lighting (i.e. streetlights) and crashes on major roads in Pennsylvania.
This project illustrates several common transportation data analysis tasks, including using:
- the data.table package for fast data manipulation
- the sf package to perform a spatial join between point data (crashes) and line data (road segments)
- the rpart package to predict lighting conditions for road segments with unknown lighting conditions, based on road segments with known lighting conditions
- the leaflet mapping package and the ggplot2 plotting package

Load required packages:
library(sf) # spatial data manipulation
library(data.table) # fast data manipulation
library(leaflet) # easy mapping in R
library(ggplot2) # plotting
library(rpart); library(rpart.plot) # decision tree models
library(MASS); library(pscl) # Poisson and zero-inflated regression
source("R/summary_plots.R") # function for creating summary plots
options(scipen = 99) # disable scientific notation
Road Segments and Traffic: A shapefile of Pennsylvania traffic data has been prepared from PennDOT’s RMSTRAFFIC data, retaining only primary state routes. The shapefile is read into R as a spatial object using the st_read function from the sf package:
segments <- st_read("shp/traffic/traffic.shp")
## Reading layer `traffic' from data source `/Users/markegge/projects/lighting_safety_r_demo/shp/traffic/traffic.shp' using driver `ESRI Shapefile'
## Simple feature collection with 15866 features and 8 fields (with 1 geometry empty)
## geometry type: MULTILINESTRING
## dimension: XY
## bbox: xmin: 1191414 ymin: 141037.8 xmax: 2807622 ymax: 1075345
## CRS: 2272
segments$id <- 1:nrow(segments) # assign a unique identifier for each segment
Crashes: Having downloaded the PA Crash data files for 2012 – 2016 (five years is the typical period of analysis for safety investigations), the CSV files are loaded into R using the fread function from the data.table package. Compared with read.csv, fread is much faster and has sensible defaults (e.g. stringsAsFactors = FALSE). A list object with each year’s data is populated, then rbindlist from data.table binds together the rows from each object in the list.
crashes <- list()
for(year in as.character(2012:2016)) {
crashes[[year]] <- fread(paste0("data/Statewide_", year, "/CRASH_", year, "_Statewide.csv"))
}
crashes <- rbindlist(crashes) # join the five files into one
The result is a dataset called crashes with 627,180 observations.
We then run the provided summary_plots function to generate a PDF with a descriptive plot of each attribute in the dataset.
## Plots generated to out/plots.pdf .
Take a moment to inspect the resulting file out/plots.pdf:
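The helper itself lives in R/summary_plots.R and isn't reproduced in this notebook. A minimal sketch of what such a function might look like (the function body below is an assumption for illustration, not the project's actual code):

```r
# Sketch of a summary_plots() helper: one descriptive plot per column,
# written to a multi-page PDF. Body is illustrative, not the project's code.
summary_plots <- function(df, file = "out/plots.pdf") {
  dir.create(dirname(file), showWarnings = FALSE, recursive = TRUE)
  pdf(file)                              # open a multi-page PDF device
  for (col in names(df)) {
    x <- df[[col]]
    if (is.numeric(x)) {
      hist(x, main = col, xlab = col)    # histogram for numeric columns
    } else {
      barplot(table(x), main = col)      # bar chart for categorical columns
    }
  }
  dev.off()
  message("Plots generated to ", file, " .")
}

# Example on a built-in dataset:
out <- tempfile(fileext = ".pdf")
summary_plots(mtcars, file = out)
```

Looping over columns like this scales to any number of attributes, which is why a single call can summarize the full crash dataset.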
Finally, filter our crashes to those occurring at night, and create a “crash_lighting” attribute, based on the metadata in docs/PA_Crashes_Data_Dictionary.pdf:
# Filter to only crashes with "2 - Dark - No Street Lights" or "3 - Dark - Street Lights"
crashes <- crashes[ILLUMINATION %in% c(2, 3)]
crashes[ILLUMINATION == 2, crash_lighting := "unlit"]
crashes[ILLUMINATION == 3, crash_lighting := "lit"]
Next, we spatially join crashes to road segments by buffering the road segments by 50 feet and then performing a spatial join.
Note, for a spatial join to execute properly, both the source and destination layers must be in the same spatial projection. In this case, we use the EPSG:2272 NAD83 / Pennsylvania South (ft) projection, which measures distances in feet.
The st_as_sf function from the sf package creates a spatial object from the crashes dataset based on the latitude and longitude values in the “DEC_LAT” and “DEC_LONG” fields.
crashes <- crashes[!(is.na(DEC_LONG) | is.na(DEC_LAT))] # keep only records with x, y values
crashes <- st_as_sf(crashes, coords = c("DEC_LONG", "DEC_LAT"), crs = 4326) # create sf spatial object
crashes <- st_transform(crashes, 2272) # reproject data to EPSG:2272
# buffer road segments by 50 feet
segment_buffer <- st_buffer(segments, dist = 50, endCapStyle = "FLAT")
# spatially join crashes to the buffered road segments
joined <- st_join(segment_buffer, crashes, left = FALSE) # inner join
Finally, we create a new data.table of the aggregate crash counts by road segment, and merge this new attribute back to our segments data:
segment_crashes <- as.data.table(st_drop_geometry(joined)) # convert sf to data.table
crash_counts <- segment_crashes[, .(annual_crashes = .N / 5), by = id] # count annual crashes by segment
# merge crash counts back to segments
segments <- merge(segments, crash_counts, by = "id", all.x = TRUE)
# calculate crash rates - crashes per million VMT
segments$annual_crashes[is.na(segments$annual_crashes)] <- 0 # segments with no joined crashes get zero
segments$mvmt <- with(segments, (DLY_VMT * 365) / 1000000) # million annual vehicle miles travelled
segments$rate <- with(segments, annual_crashes / mvmt) # annual crashes per 1m annual vmt
Our roadway dataset does not include information about overhead lighting. The section below uses crash data to impute a “lighting” attribute for our road segments, and then uses a decision tree classifier to assign lighting to unknown segments based on known segments.
# count total crashes by segment id and illumination condition
counts <- segment_crashes[, .(crash_count = .N), by = .(id, crash_lighting)]
segment_counts <- dcast(counts, id ~ crash_lighting, # long to wide transform
value.var = "crash_count", fill = 0) # filling missing values with zero
# assign segment_lighting based on majority of crashes (lit or unlit)
segment_counts[, segment_lighting := ifelse(lit >= unlit, "lit", "unlit")]
# join the imputed roadway lighting condition to all road segments
light <- merge(as.data.table(st_drop_geometry(segments)), # create data.table from segments attributes
segment_counts[, .(id, segment_lighting)], # join result, keeping only segment_lighting field
by = "id", all.x = TRUE) # join by id, keeping all segments
# Table of "lit", "unlit", and unknown (NA) segment_lighting values
table(light$segment_lighting, useNA = "ifany")
##
## lit unlit <NA>
## 6081 7452 2333
For segments where segment_lighting is unknown, predict which road segments are illuminated using an rpart decision tree:
fit <- rpart(segment_lighting ~ ST_RT_NO + CTY_CODE + DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = light)
# create a confusion matrix to evaluate classifier performance
actual <- light[!is.na(segment_lighting), segment_lighting]
predicted <- predict(fit, type = "class")
(cm <- as.matrix(table(Actual = actual, Predicted = predicted))) # create the confusion matrix
## Predicted
## Actual lit unlit
## lit 4624 1457
## unlit 1539 5913
accuracy <- sum(diag(cm)) / sum(cm) # number of correctly classified instances per class / number of instances
p <- rowSums(cm) / sum(cm); q <- colSums(cm) / sum(cm)
kappa <- (accuracy - sum(p * q)) / (1 - sum(p * q)) # kappa score indicates moderate agreement
cat("Model is", round(accuracy * 100), "% accurate with a Kappa score of", round(kappa, 2))
## Model is 78 % accurate with a Kappa score of 0.55
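The accuracy and kappa arithmetic can be verified directly from the confusion-matrix counts printed above (the matrix below is rebuilt from those printed values):

```r
# Rebuild the confusion matrix from the printed counts and recompute
# accuracy and Cohen's kappa from first principles.
cm <- matrix(c(4624, 1539, 1457, 5913), nrow = 2,
             dimnames = list(Actual = c("lit", "unlit"),
                             Predicted = c("lit", "unlit")))
accuracy <- sum(diag(cm)) / sum(cm)              # observed agreement
p <- rowSums(cm) / sum(cm)                       # actual class proportions
q <- colSums(cm) / sum(cm)                       # predicted class proportions
expected <- sum(p * q)                           # agreement expected by chance
kappa <- (accuracy - expected) / (1 - expected)  # chance-corrected agreement
round(c(accuracy = accuracy, kappa = kappa), 2)  # 0.78 and 0.55
```

Kappa discounts the agreement a classifier would achieve by guessing the class proportions alone, which is why it is a sterner measure than raw accuracy.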
Our decision tree is reasonably accurate (~78%) and the Kappa score (0.55) indicates moderate evidence that our model outperforms random chance. We can use this model to assign a lighting condition to the unknown segments.
# use decision tree to fill in missing lighting values
predictions <- predict(fit, type = "class", newdata = light[is.na(segment_lighting)])
light[is.na(segment_lighting), segment_lighting := as.character(predictions)]
# Updated table of "lit" and "unlit" segment_lighting values
table(light$segment_lighting, useNA = "ifany")
##
## lit unlit
## 7413 8453
# join lighting attribute back to road segments
segments <- merge(segments, light[, .(id, lighting = segment_lighting)], by = "id", all.x = TRUE)
Exploratory data analysis allows the analyst to become more familiar with the data and informs the selection of subsequent modelling or data visualization steps.
# convert spatial object to data.table for easy data manipulation
DT <- as.data.table(st_drop_geometry(segments))
DT <- DT[!is.na(rate)] # several segments have no geom or rate
# plot crash counts vs. vmt
# relationship between crashes and vmt seems fairly linear
ggplot(DT, aes(x = mvmt, y = annual_crashes, color = lighting)) +
geom_point(alpha = 0.65, size = 1) + ggtitle("Crash Count vs. VMT")
Visualize the illumination conditions and crash rates:
# limit map data to the Pittsburgh region (Districts 11 and 12); leaflet requires WGS84 (EPSG:4326) coordinates
leaflet_segments <- segments[which(segments$DISTRICT_N %in% c("11", "12") & !st_is_empty(segments)), ]
fpal <- colorFactor(c("yellow", "brown", "orange"), leaflet_segments$lighting)
qpal <- colorQuantile(palette = "YlOrRd", domain = leaflet_segments$rate, n = 6)
leaflet(data = st_transform(leaflet_segments, 4326)) %>%
addProviderTiles(providers$CartoDB.DarkMatter) %>%
addPolylines(color = ~fpal(lighting), group = "Lighting") %>%
addPolylines(color = ~qpal(rate), group = "Crash Rates") %>%
addLayersControl(
overlayGroups = c("Lighting", "Crash Rates"),
options = layersControlOptions(collapsed = FALSE)
) %>% hideGroup("Crash Rates") %>%
addLegend(pal = fpal, values = ~lighting, title = "Lighting") %>%
addLegend(pal = qpal, values = ~rate, title = "Crash Rate Quantile")
Decision trees are useful tools to determine which dataset attributes are predictive of our outcome variables of interest.
# What explains / predicts crash counts? A decision tree:
fit <- rpart(annual_crashes ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = DT)
rpart.plot(fit)
# What explains / predicts crash rates?
fit <- rpart(rate ~ DISTRICT_N + SEG_LNGTH_ + CUR_AADT + DLY_VMT, data = DT)
rpart.plot(fit)
# box plot of crash rates
boxplot(rate ~ lighting, data = DT[rate < 2])
# are lighting and daily VMT related?
cor(DT$DLY_VMT, DT$lighting == "lit") # Yes - weak negative correlation
## [1] -0.1466999
# some descriptive plots
hist(DT$rate, main = "Distribution of Crash Rates")
hist(log(DT$rate), main = "Distribution of Log Crash Rates")
hist(DT[rate < 20]$rate, breaks = 50, main = "Distribution of Crash Rates Under 20")
hist(DT$mvmt, main = "Distribution of VMT") # VMT has an exponential distribution
hist(DT$annual_crashes, main = "Distribution of Crash Counts") # annual crashes has similar distribution
hist(log(DT$mvmt), main = "Distribution of Log VMT")
Having completed our exploratory data analysis, we see that there’s a reliable relationship between crash counts and VMT. To estimate the lighting relationship, we’ll fit a regression model controlling for VMT.
Count data is typically Poisson or Negative Binomially distributed, so we’ll use an appropriate regression model.
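As a quick illustration on synthetic data (not the crash counts): a Poisson variable's variance equals its mean, while negative binomial draws with the same mean are overdispersed, which is why checking a dispersion statistic after fitting matters.

```r
# Poisson vs. negative binomial dispersion, on simulated counts.
set.seed(42)
pois <- rpois(10000, lambda = 5)           # Poisson: variance equals the mean
nb   <- rnbinom(10000, mu = 5, size = 1)   # NB: variance = mu + mu^2/size = 30
c(mean = mean(pois), var = var(pois))      # both close to 5
c(mean = mean(nb),   var = var(nb))        # mean near 5, variance near 30
```

When real counts show variance well above the mean, as the crash data does, a Poisson model understates uncertainty and a negative binomial (or zero-inflated) model is a better starting point.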
DT[, total_crashes := as.integer(annual_crashes * 5)] # total 5 year crashes for count model (whole numbers)
# fit a Poisson regression model
fit_p <- glm(total_crashes ~ lighting + mvmt, data = DT, family = poisson)
summary(fit_p)
##
## Call:
## glm(formula = total_crashes ~ lighting + mvmt, family = poisson,
## data = DT)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -36.333 -2.400 -1.120 0.743 29.788
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.0005691 0.0040672 491.88 <0.0000000000000002 ***
## lightingunlit -0.3267165 0.0056339 -57.99 <0.0000000000000002 ***
## mvmt 0.0395346 0.0001177 335.83 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 181883 on 15862 degrees of freedom
## Residual deviance: 128483 on 15860 degrees of freedom
## AIC: 176707
##
## Number of Fisher Scoring iterations: 5
# Dispersion Statistic
sum(resid(fit_p, type = "pearson")^2) / (nrow(DT) - length(coef(fit_p))) # data is 10x overdispersed
## [1] 10.06537
# goodness of fit measure:
1 - pchisq(summary(fit_p)$deviance, summary(fit_p)$df.residual) # model does not fit the data (p < 0.05)
## [1] 0
# negative binomial regression model
fit_nb <- MASS::glm.nb(total_crashes ~ lighting + mvmt, data = DT)
summary(fit_nb)
##
## Call:
## MASS::glm.nb(formula = total_crashes ~ lighting + mvmt, data = DT,
## init.theta = 1.133411182, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.6619 -0.9308 -0.2971 0.3526 5.2944
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.650522 0.012224 135.02 <0.0000000000000002 ***
## lightingunlit -0.466783 0.016615 -28.09 <0.0000000000000002 ***
## mvmt 0.110106 0.001062 103.66 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(1.1334) family taken to be 1)
##
## Null deviance: 25894 on 15862 degrees of freedom
## Residual deviance: 17938 on 15860 degrees of freedom
## AIC: 92321
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 1.1334
## Std. Err.: 0.0157
##
## 2 x log-likelihood: -92313.2440
# Dispersion Statistic
sum(resid(fit_nb)^2) / (nrow(DT) - length(coef(fit_nb)) + 1) # still overdispersed (1.13), but not as bad
## [1] 1.130979
# goodness of fit measure:
1 - pchisq(summary(fit_nb)$deviance, summary(fit_nb)$df.residual) # model does not fit the data
## [1] 0
# zero inflated poisson model
fit_zip <- pscl::zeroinfl(total_crashes ~ lighting + mvmt | CUR_AADT, data = DT, dist = "poisson")
summary(fit_zip)
##
## Call:
## pscl::zeroinfl(formula = total_crashes ~ lighting + mvmt | CUR_AADT,
## data = DT, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -26.5057 -1.2926 -0.6677 0.5348 49.9248
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.1916595 0.0040869 536.26 <0.0000000000000002 ***
## lightingunlit -0.3916956 0.0056465 -69.37 <0.0000000000000002 ***
## mvmt 0.0370272 0.0001227 301.66 <0.0000000000000002 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.787766645 0.032558983 -24.20 <0.0000000000000002 ***
## CUR_AADT -0.000161924 0.000005588 -28.98 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 8
## Log-likelihood: -7.763e+04 on 5 Df
# Dispersion Statistic
sum(resid(fit_zip, type = "pearson")^2) / (nrow(DT) - length(coef(fit_zip))) # still significantly overdispersed (~6x)
## [1] 5.966286
# goodness of fit:
1 - pchisq(sum(resid(fit_zip)^2), summary(fit_zip)$df.residual) # model does not fit the data (p < 0.05)
## [1] 0
# zero inflated negative binomial model
fit_zinb <- pscl::zeroinfl(total_crashes ~ lighting + mvmt | CUR_AADT, data = DT, dist = "negbin")
summary(fit_zinb)
##
## Call:
## pscl::zeroinfl(formula = total_crashes ~ lighting + mvmt | CUR_AADT,
## data = DT, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.1544 -0.7138 -0.2855 0.3816 15.7685
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.748379 0.013345 131.01 <0.0000000000000002 ***
## lightingunlit -0.469159 0.015936 -29.44 <0.0000000000000002 ***
## mvmt 0.099827 0.001661 60.09 <0.0000000000000002 ***
## Log(theta) 0.287348 0.015793 18.20 <0.0000000000000002 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.566211732 0.077540336 -7.302 0.000000000000283 ***
## CUR_AADT -0.000684352 0.000005587 -122.481 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 1.3329
## Number of iterations in BFGS optimization: 43
## Log-likelihood: -4.587e+04 on 6 Df
# Dispersion Statistic
sum(resid(fit_zinb, type = "pearson")^2) / (nrow(DT) - length(coef(fit_zinb)) + 1) # this is close to 1, which looks good
## [1] 1.04101
# goodness of fit:
1 - pchisq(sum(resid(fit_zinb)^2), summary(fit_zinb)$df.residual) # model doesn't fit but is closer than the others
## [1] 0.0001484503
# what does this look like?
synth <- data.table(lighting = c(rep("lit", 100), rep("unlit", 100)),
mvmt = seq(0.1, 70, length.out = 100))
fit_aadt <- lm(CUR_AADT ~ poly(mvmt, 3), data = DT)
synth$CUR_AADT <- predict(fit_aadt, newdata = synth)
synth$total_crashes <- predict(fit_zinb, newdata = synth)
ggplot(synth, aes(x = mvmt, y = total_crashes, color = lighting)) +
geom_point(data = DT, size = 0.5, alpha = 0.33) +
geom_line() +
coord_cartesian(xlim = c(0, 40), ylim = c(0, 60)) +
ggtitle("Estimated Crashes and VMT", "By Lighting Condition")
The zero-inflated negative binomial model provides the best fit. Because the count model uses a log link, the -0.47 coefficient for unlit segments means unlit road segments experience exp(-0.47) ≈ 0.63 times as many crashes (roughly 37% fewer) than comparable lit segments, controlling for VMT.
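Because the count model uses a log link, its coefficients translate into multiplicative rate ratios. A short calculation using the lightingunlit coefficient from the ZINB output above:

```r
# Exponentiate the log-scale coefficient to get an incidence rate ratio.
b_unlit <- -0.469159              # "lightingunlit" count-model coefficient
rate_ratio <- exp(b_unlit)        # crashes on unlit relative to lit segments
pct_fewer <- (1 - rate_ratio) * 100
round(c(rate_ratio = rate_ratio, pct_fewer = pct_fewer), 2)
```

The rate ratio comes out near 0.63, i.e. roughly 37% fewer crashes on unlit segments, holding VMT constant.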
This finding is rather counter-intuitive from a “lighting improves safety” perspective, but makes sense if lighting is installed in locations with higher crash risk (e.g. intersections).
More analysis and a richer dataset may be necessary to isolate the safety impact of lighting. For example, a maintenance dataset could perhaps be used to determine when normally lit intersections were, in fact, dark, and to compare the crash history for identical locations under lit and unlit conditions.
Notebook Author: Mark Egge (mark@eateggs.com)