This script builds a Support Vector Machine classifier that predicts the run status of the JULES-ES-1.0 ensemble. Can you make a better classifier?
set.seed(42)
library(RColorBrewer)
library(e1071)
library(MASS)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
load('data/wave0_summary.RData')
There are 3 classes: “RAN” (JULES ran OK), “CRASHED” (JULES crashed, or did not run) and “LOWCARBON” (the model ran, but there is no functioning carbon cycle).
There are 499 ensemble members, with inputs chosen in a Latin hypercube design. lhs holds the raw input values and X holds the inputs normalized to [0, 1].
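A quick check of the loaded objects (Y_char and X both come from wave0_summary.RData and are used below):
# Class counts, and a check that X is normalized to [0, 1]
table(Y_char)
dim(X)
range(X)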
Here is a plot of members that are “CRASHED” or “LOWCARBON”.
d <- ncol(X)
lowcarbon_ix <- which(Y_char =='LOWCARBON')
crashed_ix <- which(Y_char == 'CRASHED')
x1 <- X[lowcarbon_ix , ]
x2 <- X[crashed_ix ,]
XY <- rbind(x1, x2)
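# In the pairs() panels below, the first nrow(x1) points of each column are the
# LOWCARBON runs (blue, lower panels) and the remainder are CRASHED (red, upper panels)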
pairs(XY,
lower.panel=function(x, y, ...) {
Xx <- x[seq_len(nrow(x1))]
Xy <- y[seq_len(nrow(x1))]
points(Xx, Xy, col = 'blue', pch = 19, cex = 0.8)
},
upper.panel=function(x, y, ...) {
Yx <- x[(nrow(x1) + 1):length(x)]
Yy <- y[(nrow(x1) + 1):length(y)]
points(Yx, Yy, col = 'red', pch = 19, cex = 0.8)
},
gap = 0,
xlim = c(0,1), ylim = c(0,1),
labels = 1:d,
oma = c(2, 18, 2, 2)) # wide left outer margin leaves room for the variable-name legend added below
reset <- function()
{
# Allows annotation of graphs, resets axes
par(mfrow=c(1, 1), oma=rep(0, 4), mar=rep(0, 4), new=TRUE)
plot(0:1, 0:1, type="n", xlab="", ylab="", axes=FALSE)
}
reset()
legend('left', legend = paste(1:d, colnames(lhs)), cex = 1.1, bty = 'n')
legend('topleft', pch = 19, col = c('red', 'blue'), legend = c('failed', 'zero carbon cycle'), bty = 'n', inset = 0.02, cex = 1.1 )
ix_train <- 1:399
ix_test <- 400:499
train <- cbind(wave0_summary_df[ix_train, 1:32], wave0_summary_df[ix_train, 'Y_char'] )
test <- cbind(wave0_summary_df[ix_test, 1:32], wave0_summary_df[ix_test, 'Y_char'] )
colnames(train) <- c(colnames(wave0_summary_df)[1:32], 'run_status')
colnames(test) <- c(colnames(wave0_summary_df)[1:32], 'run_status')
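The split above simply takes the first 399 ensemble members for training and the last 100 for testing. Since the classes are imbalanced, a class-stratified split is another option; a sketch using caret (not used in what follows):
# Hypothetical alternative: an 80/20 split stratified by run status
ix_train_strat <- createDataPartition(as.factor(wave0_summary_df$Y_char), p = 0.8, list = FALSE)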
The classifier is much worse if we include all 32 of the inputs.
# Which inputs to select for classification training?
# Using all inputs is not good.
traincols <- 1:32
train_x <- train[, traincols]
train_y <- train[, 33]
class(train_x) <- 'numeric'
test_x <- test[, traincols]
test_y <- test[, 33]
class(test_x) <- 'numeric'
svm_fit <- svm(train_x, as.factor(train_y), kernel = "radial", probability = FALSE)
svm_pred <- predict(svm_fit, newdata=test_x, probability = FALSE)
# Evaluate the model
confusionMatrix(svm_pred, as.factor(test_y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction CRASHED LOWCARBON RAN
## CRASHED 0 0 0
## LOWCARBON 0 0 0
## RAN 6 22 72
##
## Overall Statistics
##
## Accuracy : 0.72
## 95% CI : (0.6213, 0.8052)
## No Information Rate : 0.72
## P-Value [Acc > NIR] : 0.5507
##
## Kappa : 0
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity 0.00 0.00 1.00
## Specificity 1.00 1.00 0.00
## Pos Pred Value NaN NaN 0.72
## Neg Pred Value 0.94 0.78 NaN
## Prevalence 0.06 0.22 0.72
## Detection Rate 0.00 0.00 0.72
## Detection Prevalence 0.00 0.00 1.00
## Balanced Accuracy 0.50 0.50 0.50
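The next fit uses two inputs we already know to be important. If that weren't known, one way to choose inputs would be a quick variable-importance ranking, for example with a random forest; a sketch (this assumes the randomForest package is available, and is not part of the original analysis):
library(randomForest)
rf_fit <- randomForest(x = train_x, y = as.factor(train_y), ntree = 500)
# Rank inputs by mean decrease in Gini impurity, largest first
head(sort(importance(rf_fit)[, 'MeanDecreaseGini'], decreasing = TRUE))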
# Which inputs to select for classification training?
# Using two inputs we know to be important gives better results
traincols <- c(8,4)
train_x <- train[, traincols]
train_y <- train[, 33]
class(train_x) <- 'numeric'
test_x <- test[, traincols]
test_y <- test[, 33]
class(test_x) <- 'numeric'
Setting class.weights = 'inverse' doesn't work here, so we choose the class weights by hand.
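An alternative to the hand-picked weights below is inverse class-frequency weights computed from the training labels; a minimal sketch (the 'balanced' scaling is an assumption, not taken from the original):
# Weight each class inversely to its frequency in the training data
counts <- table(train_y)
wts_inverse <- as.numeric(length(train_y) / (length(counts) * counts))
names(wts_inverse) <- names(counts)
round(wts_inverse, 2)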
wts <- c(5,2,1)
names(wts) <- c('CRASHED', 'LOWCARBON', 'RAN')
svm_fit <- svm(train_x, as.factor(train_y), kernel = "radial", class.weights = wts, probability = FALSE)
svm_pred <- predict(svm_fit, newdata=test_x, probability = FALSE)
# Evaluate the model
confusionMatrix(svm_pred, as.factor(test_y))
## Confusion Matrix and Statistics
##
## Reference
## Prediction CRASHED LOWCARBON RAN
## CRASHED 1 2 0
## LOWCARBON 3 17 5
## RAN 2 3 67
##
## Overall Statistics
##
## Accuracy : 0.85
## 95% CI : (0.7647, 0.9135)
## No Information Rate : 0.72
## P-Value [Acc > NIR] : 0.001678
##
## Kappa : 0.6469
##
## Mcnemar's Test P-Value : 0.440227
##
## Statistics by Class:
##
## Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity 0.1667 0.7727 0.9306
## Specificity 0.9787 0.8974 0.8214
## Pos Pred Value 0.3333 0.6800 0.9306
## Neg Pred Value 0.9485 0.9333 0.8214
## Prevalence 0.0600 0.2200 0.7200
## Detection Rate 0.0100 0.1700 0.6700
## Detection Prevalence 0.0300 0.2500 0.7200
## Balanced Accuracy 0.5727 0.8351 0.8760
We correctly identify only one of the six crashes in the test set, which might need looking at. Accuracy is 0.85, which feels like it might be improved.
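One route to a better classifier (a sketch only, not run here) is to tune the cost and gamma of the radial kernel by cross-validation with e1071's tune.svm():
# 10-fold cross-validated grid search over the radial-kernel hyperparameters
svm_tune <- tune.svm(train_x, as.factor(train_y),
                     gamma = 2^(-4:2), cost = 2^(0:6), kernel = 'radial')
summary(svm_tune)
svm_tune$best.parameters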
upper <- apply(train_x, 2, max)
lower <- apply(train_x, 2, min)
x1_seq <- seq(from=lower[1], to=upper[1], length.out = 50)
x2_seq <- seq(from=lower[2], to=upper[2], length.out = 50)
# A grid of test points that covers the entire data space
test_grid <- as.matrix(expand.grid(x1_seq, x2_seq))
colnames(test_grid) <- colnames(train_x)
svm_pred_grid <- predict(svm_fit, newdata=test_grid, probability = FALSE)
pal <- c('red', 'blue','grey')
par(las = 1, cex = 1.3)
plot(test_grid, col = pal[svm_pred_grid], pch = 20, cex = 0.8, main = 'SVM classifier prediction')
points(train_x, bg = pal[as.factor(train_y)], cex = 1, pch =21)
legend('topleft', legend = unique(as.factor(train_y)),
pt.bg = pal[unique(as.factor(train_y))],
pch = 21,
col = 'black',
text.col= pal[unique(as.factor(train_y))],
bg = 'white',
cex = 0.8)
svm_fit_prob <- svm(train_x, as.factor(train_y), probability = TRUE)
svm_pred_prob <- predict(svm_fit_prob, newdata=test_grid, probability = TRUE)
#confusionMatrix(svm_pred, as.factor(test_y))
svm_pred_prob_df <- attr(svm_pred_prob, 'probabilities')
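# expand.grid varies x1 fastest, so filling column-wise with nrow = length(x1_seq)
# puts x1_seq along the rows and x2_seq along the columns, as contour() expects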
ranmat <- matrix(svm_pred_prob_df[,'RAN'], nrow = length(x1_seq))
lowcarbonmat <- matrix(svm_pred_prob_df[,'LOWCARBON'], nrow = length(x1_seq))
crashedmat <- matrix(svm_pred_prob_df[,'CRASHED'], nrow = length(x1_seq))
pal <- c('red', 'blue','grey')
par(las = 1, cex = 1.3)
plot(test_grid, type = 'n', cex = 0.8, main = 'SVM classifier prediction: RAN')
contour(x = x1_seq, y = x2_seq, ranmat,
add = TRUE, levels = c(0.2, 0.4, 0.6, 0.8, 1))
plot(test_grid, type = 'n', cex = 0.8, main = 'SVM classifier prediction: LOWCARBON')
contour(x1_seq, x2_seq, lowcarbonmat,
add = TRUE, levels = c(0.2, 0.4, 0.6, 0.8, 1), col = pal[2])
plot(test_grid, type = 'n', cex = 0.8, main = 'SVM classifier prediction: CRASHED')
contour(x1_seq, x2_seq, crashedmat,
add = TRUE, levels = c(0.5, 0.6, 0.7, 0.8), col = pal[3])
pal <- c('red', 'blue','grey')
par(las = 1, cex = 1.3)
filled.contour(x = x1_seq, y = x2_seq, ranmat, levels = seq(from = 0, to = 1, by = 0.2), main = "RAN")
filled.contour(x = x1_seq, y = x2_seq, lowcarbonmat, levels = seq(from = 0, to = 1, by = 0.2), main = "LOWCARBON")
filled.contour(x = x1_seq, y = x2_seq, crashedmat, levels = seq(from = 0, to = 1, by = 0.2), main = "CRASHED")