# Load necessary libraries
library(dplyr)
setwd("C:/Users/rsb84/Desktop/RB/ds_projects/GIS/DRC/")
final_net.sf.attacks_test <- readRDS(file = "final_net.sf.attacks_test-after_adding_HiSig_and_HiSig.dist.rds")
combined_predictions <- readRDS("combined_models_predictions_across_territories.rds")
# Merge predictions with test data based on uniqueID
data_combined <- final_net.sf.attacks_test %>%
left_join(combined_predictions, by = "uniqueID") %>%
dplyr::select(uniqueID, NAME_3, countAttacks, Predicted_Value)
# Define presence of attacks
data_combined <- data_combined %>%
mutate(
actual_presence = ifelse(countAttacks >= 1, 1, 0),
predicted_presence = ifelse(Predicted_Value >= 0.5, 1, 0)
)
# saveRDS(data_combined, "data_combined.rds")
# data_combined <- readRDS("data_combined.rds")
# For each territory, calculate the percentage of grid cells where actual attacks occurred and predicted attacks also occurred
results_code <- data_combined %>%
group_by(NAME_3) %>% # Group data by territory
summarise(
total_grid_cells = n(), # Total number of uniqueID-based grid cells in each territory
matched_cells = sum(ifelse(
(actual_presence == 1 & predicted_presence == 1) |
(actual_presence == 0 & predicted_presence == 0), 1, 0
)),
percentage_correct = (matched_cells / total_grid_cells) * 100 # Calculate percentage of matched grid cells
) %>%
as.data.frame() %>%
dplyr::select(-geometry)
# Display the results
print(results_code)
# > print(results_code)
# NAME_3 total_grid_cells matched_cells percentage_correct
# 1 Aru 334 333 99.70060
# 2 Beni 380 359 94.47368
# 3 Bukavu 20 16 80.00000
# 4 Djugu 427 412 96.48712
# 5 Fizi 656 644 98.17073
# 6 Goma 38 34 89.47368
# 7 Idjwi 40 40 100.00000
# 8 Irumu 365 355 97.26027
# 9 Kabare 230 229 99.56522
# 10 Kalehe 227 224 98.67841
# 11 Lubero 765 760 99.34641
# 12 Mahagi 266 262 98.49624
# 13 Mambasa 1477 1472 99.66148
# 14 Masisi 167 148 88.62275
# 15 Mwenga 478 473 98.95397
# 16 Rutshuru 263 241 91.63498
# 17 Shabunda 1030 1028 99.80583
# 18 Uvira 159 153 96.22642
# 19 Walikale 1043 1041 99.80825
# 20 Walungu 44 43 97.72727
print(mean(results_code$percentage_correct))
# > print(mean(results_code$percentage_correct))
# [1] 96.20467
print(median(results_code$percentage_correct))
# > print(median(results_code$percentage_correct))
# [1] 98.33349
print(sum(data_combined$actual_presence))
# > print(sum(data_combined$actual_presence))
# [1] 136
print(sum(data_combined$predicted_presence))
# > print(sum(data_combined$predicted_presence))
# [1] 156
print(sum(data_combined$countAttacks))
# > print(sum(data_combined$countAttacks))
# [1] 268
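# custom_round() is a helper defined earlier in the project and not reproduced in this section. The
# fallback below is a hypothetical stand-in only (an assumption, not the original definition): it rounds
# predicted counts to whole numbers, treating negative predictions as zero, and is skipped if the real
# helper is already loaded.
if (!exists("custom_round")) {
custom_round <- function(x) {
x <- pmax(x, 0) # treat negative predicted counts as zero
floor(x + 0.5) # round half up to the nearest whole count
}
}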
print(sum(custom_round(data_combined$Predicted_Value)))
# > print(sum(custom_round(data_combined$Predicted_Value)))
# [1] 237
# Precision
results_precision <- data_combined %>%
group_by(NAME_3) %>%
summarise(
total_predicted_positive = sum(predicted_presence == 1),
true_positives = sum(predicted_presence == 1 & actual_presence == 1),
precision = (true_positives / total_predicted_positive) * 100
) %>%
as.data.frame() %>%
dplyr::select(-geometry)
print(results_precision)
# > print(results_precision)
# NAME_3 total_predicted_positive true_positives precision
# 1 Aru 1 0 0.00000
# 2 Beni 29 13 44.82759
# 3 Bukavu 2 0 0.00000
# 4 Djugu 16 6 37.50000
# 5 Fizi 11 5 45.45455
# 6 Goma 8 6 75.00000
# 7 Idjwi 0 0 NaN
# 8 Irumu 13 11 84.61538
# 9 Kabare 0 0 NaN
# 10 Kalehe 3 1 33.33333
# 11 Lubero 5 2 40.00000
# 12 Mahagi 1 1 100.00000
# 13 Mambasa 11 7 63.63636
# 14 Masisi 20 5 25.00000
# 15 Mwenga 5 2 40.00000
# 16 Rutshuru 17 6 35.29412
# 17 Shabunda 2 0 0.00000
# 18 Uvira 6 6 100.00000
# 19 Walikale 5 3 60.00000
# 20 Walungu 1 1 100.00000
print(mean(results_precision$precision, na.rm = TRUE))
# > print(mean(results_precision$precision, na.rm = TRUE))
# [1] 49.14785
print(median(results_precision$precision, na.rm = TRUE))
# > print(median(results_precision$precision, na.rm = TRUE))
# [1] 42.41379
# Recall
results_recall <- data_combined %>%
group_by(NAME_3) %>%
summarise(
total_actual_positive = sum(actual_presence == 1),
true_positives = sum(predicted_presence == 1 & actual_presence == 1),
recall = (true_positives / total_actual_positive) * 100
) %>%
as.data.frame() %>%
dplyr::select(-geometry)
print(results_recall)
# > print(results_recall)
# NAME_3 total_actual_positive true_positives recall
# 1 Aru 0 0 NaN
# 2 Beni 18 13 72.22222
# 3 Bukavu 2 0 0.00000
# 4 Djugu 11 6 54.54545
# 5 Fizi 11 5 45.45455
# 6 Goma 8 6 75.00000
# 7 Idjwi 0 0 NaN
# 8 Irumu 19 11 57.89474
# 9 Kabare 1 0 0.00000
# 10 Kalehe 2 1 50.00000
# 11 Lubero 4 2 50.00000
# 12 Mahagi 5 1 20.00000
# 13 Mambasa 8 7 87.50000
# 14 Masisi 9 5 55.55556
# 15 Mwenga 4 2 50.00000
# 16 Rutshuru 17 6 35.29412
# 17 Shabunda 0 0 NaN
# 18 Uvira 12 6 50.00000
# 19 Walikale 3 3 100.00000
# 20 Walungu 2 1 50.00000
print(mean(results_recall$recall, na.rm = TRUE))
# > print(mean(results_recall$recall, na.rm = TRUE))
# [1] 50.20392
print(median(results_recall$recall, na.rm = TRUE))
# > print(median(results_recall$recall, na.rm = TRUE))
# [1] 50
# Specificity = the number of true negatives (correctly predicted non-attack locations) / (the number of true negatives + the number of false positives, i.e., non-attack locations falsely predicted to be attack locations).
# Sensitivity/Recall = the number of true positives (correctly predicted attack locations) / the total number of actual attack locations (i.e., the number of true positives + the number of false negatives, where false negatives are attack locations incorrectly predicted to be non-attack locations).
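# As a minimal sketch (assuming the data_combined object and the actual_presence / predicted_presence
# columns created above), overall specificity and sensitivity across all grid cells can be computed
# directly from the confusion-matrix counts:
tn <- sum(data_combined$actual_presence == 0 & data_combined$predicted_presence == 0) # true negatives
fp <- sum(data_combined$actual_presence == 0 & data_combined$predicted_presence == 1) # false positives
tp <- sum(data_combined$actual_presence == 1 & data_combined$predicted_presence == 1) # true positives
fn <- sum(data_combined$actual_presence == 1 & data_combined$predicted_presence == 0) # false negatives
specificity <- tn / (tn + fp)
sensitivity <- tp / (tp + fn) # identical to recall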
# Area Under the ROC Curve:
library(pROC)
# Compute ROC curve
roc_obj <- roc(response = data_combined$actual_presence, predictor = data_combined$Predicted_Value)
# saveRDS(roc_obj, "roc_obj.rds")
# Calculate AUROC
auc_value <- auc(roc_obj)
# Print AUROC
print(paste("Area Under the ROC Curve (AUROC):", round(auc_value, 4)))
# [1] "Area Under the ROC Curve (AUROC): 0.877"
# Plot ROC curve
plot(roc_obj, main = "ROC Curve (AUC = 0.877)", cex.main = 0.8, cex.lab = 0.8, cex.axis = 0.5)
# Next, let's plot the Precision-Recall Curve and calculate the PRAUC
library(PRROC)
library(fields)
labels <- as.numeric(as.character(data_combined$actual_presence))
# Understanding Why It is Best Not to Use Regression Predicted Values for the Precision-Recall Curve:
# The Precision-Recall (PR) curve is a classification metric that requires:
#
# - Continuous scores representing the likelihood of the positive class.
# - Higher scores indicating a higher probability of being in the positive class.
# Problems in Our Case:
# - Negative predicted values: negative values from a regression model do not intuitively represent probabilities or likelihoods (although they do not, by themselves, make the Area Under the Precision-Recall Curve (AUPRC) score inaccurate).
# - Inconsistent score interpretation: if higher scores correspond to higher predicted counts (and hence a higher likelihood of attack), negative scores complicate that interpretation.
# - Plot legend: raw scores produce a threshold color-gradient legend that ranges from the most negative to the most positive predicted value.
# Alternative: Apply a Sigmoid Function
# - Transform predictions: use a sigmoid function to map predicted values into the (0, 1) range. The Precision-Recall (PR) curve depends only on the ranking of scores, not on their absolute values. Because the sigmoid transformation is strictly increasing, it preserves that ranking, so the PR curve, and therefore the AUPRC score, remains unchanged.
# Note: It would not be a good idea to convert all negative predicted values to zero before applying the
# sigmoid function, even though negative predictions are, in effect, predictions of zero attacks. Doing so
# would assign every negative prediction a sigmoid value of exactly 0.5 (since sigmoid(0) = 0.5), erasing
# the differences among them and potentially misranking risk: a prediction that is more negative than
# another should be ranked as lower risk, not given the same score. Keeping the negative values as they
# are and then applying the sigmoid preserves the relative risk ranking of every prediction.
# Predicted scores
scores <- data_combined$Predicted_Value
scores_transformed <- 1 / (1 + exp(-scores))
# Compute Precision-Recall curve
pr <- pr.curve(scores.class0 = scores_transformed[labels == 1],
scores.class1 = scores_transformed[labels == 0],
curve = TRUE)
# scores.class0 argument must take the positive class (presence of attacks)
# scores.class1 argument must take the negative class (absence of attacks)
# scores_transformed[labels == 1] contains predicted values for grid cells where attacks actually occurred.
# scores_transformed[labels == 0] contains predicted values for grid cells where attacks did not actually occur.
saveRDS(pr, "pr.rds")
# Print AUPRC
print(paste("Area Under the Precision-Recall Curve (AUPRC):", round(pr$auc.integral, 4)))
# [1] "Area Under the Precision-Recall Curve (AUPRC): 0.4835"
# Output plot to file
png("Precision-Recall Curve.png", width = 800, height = 800)
# Plot the Precision-Recall Curve
plot(pr, main = "Precision-Recall Curve\n(Color Gradient: Thresholds at which Precision & Recall are Computed)", cex.main = 1.3, cex.lab = 1.2, cex.axis = 1.2)
# Close the graphics device so the PNG file is actually written to disk
dev.off()
# Calculating F1-Beta Scores
library(caret)
# Create a confusion matrix
conf_mat <- confusionMatrix(factor(data_combined$predicted_presence),
factor(data_combined$actual_presence),
positive = "1")
# Extract precision and recall
precision <- conf_mat$byClass['Precision']
recall <- conf_mat$byClass['Recall']
# Specify beta value (e.g., beta = 1 for standard F1 score)
beta <- 1
# Calculate F1-Beta score
f1_beta_score <- (1 + beta^2) * (precision * recall) / ((beta^2 * precision) + recall)
# Print F1-Beta score
print(paste("F1-Beta Score (beta =", beta, "):", round(f1_beta_score, 4)))
# [1] "F1-Beta Score (beta = 1 ): 0.5137"
# Calculating F2-Beta Scores
library(caret)
# Create a confusion matrix
conf_mat <- confusionMatrix(factor(data_combined$predicted_presence),
factor(data_combined$actual_presence),
positive = "1")
# Extract precision and recall
precision <- conf_mat$byClass['Precision']
recall <- conf_mat$byClass['Recall']
# Specify beta value (e.g., β = 2: Recall is given four times the weight of precision)
beta <- 2
# Calculate F2-Beta score
f2_beta_score <- (1 + beta^2) * (precision * recall) / ((beta^2 * precision) + recall)
# Print F2-Beta score
print(paste("F2-Beta Score (beta =", beta, "):", round(f2_beta_score, 4)))
# [1] "F2-Beta Score (beta = 2 ): 0.5357"
# ... repeated_spcv_block cross-validation for Uvira (absolute difference of 2), all 3 of which are among the 10 territories experiencing the most attacks.
# Load necessary libraries
library(pROC)
library(dplyr)
# Initialize an empty data frame to store AUROC results
results_auroc <- data.frame(NAME_3 = character(), AUROC = numeric(), stringsAsFactors = FALSE)
# Get unique territories
territories <- unique(data_combined$NAME_3)
# Loop over each territory
for (territory in territories) {
# Subset data for the current territory
territory_data <- data_combined %>% filter(NAME_3 == territory)
# Check if there are both positive and negative classes
if (length(unique(territory_data$actual_presence)) > 1) {
# Compute ROC curve
roc_obj <- roc(response = territory_data$actual_presence, predictor = territory_data$Predicted_Value)
# Calculate AUROC
auc_value <- auc(roc_obj)
} else {
# Assign NA if only one class is present
auc_value <- NA
}
# Append results
results_auroc <- rbind(results_auroc, data.frame(NAME_3 = territory, AUROC = auc_value, stringsAsFactors = FALSE))
}
# Load necessary libraries
library(PRROC)
library(dplyr)
# Initialize an empty data frame to store AUPRC results
results_auprc <- data.frame(NAME_3 = character(), AUPRC = numeric(), stringsAsFactors = FALSE)
# Get unique territories
territories <- unique(data_combined$NAME_3)
# Loop over each territory
for (territory in territories) {
# Subset data for the current territory
territory_data <- data_combined %>% filter(NAME_3 == territory)
# Check if both classes are present
if (length(unique(territory_data$actual_presence)) > 1) {
# Convert actual_presence to numeric
labels <- as.numeric(as.character(territory_data$actual_presence))
scores <- territory_data$Predicted_Value
# Compute Precision-Recall curve
pr <- pr.curve(scores.class0 = scores[labels == 1],
scores.class1 = scores[labels == 0],
curve = FALSE)
# Get AUPRC
auprc_value <- pr$auc.integral
} else {
# Assign NA if only one class is present
auprc_value <- NA
}
# Append results
results_auprc <- rbind(results_auprc, data.frame(NAME_3 = territory, AUPRC = auprc_value, stringsAsFactors = FALSE))
}
# Load necessary libraries
library(caret)
library(dplyr)
# Initialize an empty data frame to store F1-Beta results
results_f1_beta <- data.frame(NAME_3 = character(), F1_Beta = numeric(), stringsAsFactors = FALSE)
# Get unique territories
territories <- unique(data_combined$NAME_3)
# Loop over each territory
for (territory in territories) {
# Subset data for the current territory
territory_data <- data_combined %>% filter(NAME_3 == territory)
# Check if both classes are present
if (length(unique(territory_data$actual_presence)) > 1) {
# Create confusion matrix
conf_mat <- confusionMatrix(factor(territory_data$predicted_presence),
factor(territory_data$actual_presence),
positive = "1")
# Extract precision and recall
precision <- conf_mat$byClass['Precision']
recall <- conf_mat$byClass['Recall']
# Handle cases where precision or recall is NA or NaN
if (is.na(precision) | is.na(recall) | is.nan(precision) | is.nan(recall) | (precision == 0 & recall == 0)) {
f1_beta_score <- NA
} else {
# Specify beta value
beta <- 1
# Calculate F1-Beta score
f1_beta_score <- (1 + beta^2) * (precision * recall) / ((beta^2 * precision) + recall)
}
} else {
# Assign NA if only one class is present
f1_beta_score <- NA
}
# Append results
results_f1_beta <- rbind(results_f1_beta, data.frame(NAME_3 = territory, F1_Beta = as.numeric(f1_beta_score), stringsAsFactors = FALSE))
}
# Load necessary libraries
library(caret)
library(dplyr)
# Initialize an empty data frame to store F2-Beta results
results_f2_beta <- data.frame(NAME_3 = character(), F2_Beta = numeric(), stringsAsFactors = FALSE)
# Get unique territories
territories <- unique(data_combined$NAME_3)
# Loop over each territory
for (territory in territories) {
# Subset data for the current territory
territory_data <- data_combined %>% filter(NAME_3 == territory)
# Check if both classes are present
if (length(unique(territory_data$actual_presence)) > 1) {
# Create confusion matrix
conf_mat <- confusionMatrix(factor(territory_data$predicted_presence),
factor(territory_data$actual_presence),
positive = "1")
# Extract precision and recall
precision <- conf_mat$byClass['Precision']
recall <- conf_mat$byClass['Recall']
# Handle cases where precision or recall is NA or NaN.
# NA means Not Available in R (a missing or undefined value); NaN means Not a Number (an undefined or
# unrepresentable value produced by an invalid mathematical operation, such as 0/0).
# Bukavu is the only territory where both precision and recall are exactly zero (rather than NaN, as in
# some other territories). When both are zero, the F-Beta calculation divides zero by zero, which yields
# NaN in R. For consistency in the output, the code below assigns NA in these cases instead.
if (is.na(precision) | is.na(recall) | is.nan(precision) | is.nan(recall) | (precision == 0 & recall == 0)) {
f2_beta_score <- NA
} else {
# Specify beta value
beta <- 2
# Calculate F2-Beta score
f2_beta_score <- (1 + beta^2) * (precision * recall) / ((beta^2 * precision) + recall)
}
} else {
# Assign NA if only one class is present
f2_beta_score <- NA
}
# Append results
results_f2_beta <- rbind(results_f2_beta, data.frame(NAME_3 = territory, F2_Beta = as.numeric(f2_beta_score), stringsAsFactors = FALSE))
}
# Combine all results into one data frame
results_combined <- results_auroc %>%
left_join(results_auprc, by = "NAME_3") %>%
left_join(results_f1_beta, by = "NAME_3") %>%
left_join(results_f2_beta, by = "NAME_3")
# Sort results_combined by NAME_3 in alphabetical order
results_combined_sorted <- results_combined %>%
arrange(NAME_3) %>%
mutate(Territory = NAME_3) %>%
dplyr::select(Territory, everything(), -NAME_3)
# Display the sorted results
print(results_combined_sorted)
# > print(results_combined_sorted)
# Territory AUROC AUPRC F1_Beta F2_Beta
# 1 Aru NA NA NA NA
# 2 Beni 0.9732965 0.52056725 0.5531915 0.6435644
# 3 Bukavu 0.8055556 0.06099891 NA NA
# 4 Djugu 0.9851399 0.57756502 0.4444444 0.5000000
# 5 Fizi 0.8023961 0.31469429 0.4545455 0.4545455
# 6 Goma 0.9541667 0.86893537 0.7500000 0.7500000
# 7 Idjwi NA NA NA NA
# 8 Irumu 0.9809857 0.87615465 0.6875000 0.6179775
# 9 Kabare 0.9126638 0.02272153 NA NA
# 10 Kalehe 0.9577778 0.53649344 0.4000000 0.4545455
# 11 Lubero 0.8628449 0.40988808 0.4444444 0.4761905
# 12 Mahagi 0.6812261 0.60763569 0.3333333 0.2380952
# 13 Mambasa 0.9921290 0.56282903 0.7368421 0.8139535
# 14 Masisi 0.6469761 0.31052692 0.3448276 0.4464286
# 15 Mwenga 0.9765295 0.24563139 0.4444444 0.4761905
# 16 Rutshuru 0.6690579 0.32795512 0.3529412 0.3529412
# 17 Shabunda NA NA NA NA
# 18 Uvira 0.8222789 0.63377255 0.6666667 0.5555556
# 19 Walikale 0.9996795 0.90410598 0.7500000 0.8823529
# 20 Walungu 0.8809524 0.56494312 0.6666667 0.5555556
# Examples include:
# - A lag variable averaging the number of actual attack locations, or the number of attacks, in the grid cells neighboring each cell (a sketch follows below).
# - Lag variables averaging the values of other features in the neighboring cells surrounding each grid cell location.
# - A spatial-temporal lag variable calculating the average predicted values (and/or the average number of attacks) from neighboring grid cells at prior time-steps. This would require modifying my dataset to include spatial-temporal data for all of my features and the target variable, and would require cross-validation strategies appropriate for spatial-temporal data.
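# The sketch below illustrates the first example under stated assumptions: it is not part of the original
# workflow, and it assumes queen-contiguity neighbours on the existing test-set fishnet
# (final_net.sf.attacks_test) are an acceptable neighbourhood definition. It uses the spdep package to
# compute, for each grid cell, the average countAttacks across its neighbouring cells.
library(spdep)
nb <- poly2nb(final_net.sf.attacks_test, queen = TRUE) # contiguity-based neighbour list
lw <- nb2listw(nb, style = "W", zero.policy = TRUE) # row-standardised spatial weights
# With row-standardised weights, lag.listw() returns the mean of each cell's neighbours
final_net.sf.attacks_test$lag_countAttacks <- lag.listw(lw, final_net.sf.attacks_test$countAttacks, zero.policy = TRUE)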