This script builds a Support Vector Machine (SVM) that predicts the run status of the JULES-ES-1.0 ensemble. Can you make a better classifier?

set.seed(42)

library(RColorBrewer)
library(e1071)
library(MASS)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
load('data/wave0_summary.RData')

Visualise the data

There are 3 classes: “RAN” (JULES ran OK), “CRASHED” (JULES crashed, or did not run) and “LOWCARBON” (the model ran, but there is no functioning carbon cycle).
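A quick check of the class balance (Y_char comes from the RData file loaded above):

table(Y_char)
round(prop.table(table(Y_char)), 2)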

There are 499 ensemble members, run at the points of a Latin hypercube design. lhs holds the raw input values and X holds the same inputs normalized to [0, 1].
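The normalization is assumed to be a per-column rescale of lhs to [0, 1]; a minimal sketch of that mapping (X_check is a name introduced here for illustration):

rng <- apply(lhs, 2, range)
X_check <- sweep(sweep(lhs, 2, rng[1, ], '-'), 2, rng[2, ] - rng[1, ], '/')  # each column rescaled by its own range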
Here is a pairs plot of the members that are “CRASHED” or “LOWCARBON”

d <- ncol(X)
lowcarbon_ix <- which(Y_char =='LOWCARBON')
crashed_ix <- which(Y_char == 'CRASHED')

x1 <- X[lowcarbon_ix , ]
x2 <- X[crashed_ix ,]

XY <- rbind(x1, x2)

pairs(XY,
      lower.panel = function(x, y, ...) {
        # The first nrow(x1) rows of XY are the LOWCARBON members (blue)
        points(x[seq_len(nrow(x1))], y[seq_len(nrow(x1))],
               col = 'blue', pch = 19, cex = 0.8)
      },
      upper.panel = function(x, y, ...) {
        # The remaining rows are the CRASHED members (red)
        points(x[(nrow(x1) + 1):length(x)], y[(nrow(x1) + 1):length(y)],
               col = 'red', pch = 19, cex = 0.8)
      },
      gap = 0,
      xlim = c(0,1), ylim = c(0,1),
      labels = 1:d,
      oma = c(2, 18, 2, 2)) # move the default tick labels off the plot 


reset <- function() {
  # Allows annotation of graphs; resets axes
  par(mfrow = c(1, 1), oma = rep(0, 4), mar = rep(0, 4), new = TRUE)
  plot(0:1, 0:1, type = "n", xlab = "", ylab = "", axes = FALSE)
}
reset()

legend('left', legend = paste(1:d, colnames(lhs)), cex = 1.1, bty = 'n')
legend('topleft', pch = 19, col = c('red', 'blue'), legend = c('failed', 'zero carbon cycle'), bty = 'n', inset = 0.02, cex = 1.1 )

Split the sample into a training set (the first 399 members) and a test set (the final 100).

ix_train <- 1:399
ix_test <- 400:499

train <- cbind(wave0_summary_df[ix_train, 1:32], wave0_summary_df[ix_train, 'Y_char'] )
test <- cbind(wave0_summary_df[ix_test, 1:32], wave0_summary_df[ix_test, 'Y_char'] )


colnames(train) <- c(colnames(wave0_summary_df)[1:32], 'run_status')
colnames(test) <- c(colnames(wave0_summary_df)[1:32], 'run_status')
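
The fixed split relies on the Latin hypercube rows being in random order. A hypothetical alternative (a sketch using caret, which is already loaded; ix_strat is a name introduced here and is not used below) is a stratified random split that preserves the class proportions:

ix_strat <- createDataPartition(as.factor(wave0_summary_df[, 'Y_char']),
                                p = 0.8, list = FALSE)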

Simple non-probabilistic SVM classifier, using two known-important inputs

The classifier is much worse if we include all of the inputs, so we start with that as a baseline.

# Which inputs to select for classification training?

# Using all inputs is not good.
traincols <- 1:32
train_x <- train[, traincols]
train_y <- train[, 33]

class(train_x) <- 'numeric'  # coerce the input columns back to numeric for svm()

test_x <- test[, traincols]
test_y <- test[, 33]

class(test_x) <- 'numeric'  # coerce the input columns back to numeric for svm()
svm_fit <- svm(train_x, as.factor(train_y), kernel = "radial", probability = FALSE)
svm_pred <- predict(svm_fit, newdata=test_x, probability = FALSE)
# Evaluate the model

confusionMatrix(svm_pred, as.factor(test_y))
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  CRASHED LOWCARBON RAN
##   CRASHED         0         0   0
##   LOWCARBON       0         0   0
##   RAN             6        22  72
## 
## Overall Statistics
##                                           
##                Accuracy : 0.72            
##                  95% CI : (0.6213, 0.8052)
##     No Information Rate : 0.72            
##     P-Value [Acc > NIR] : 0.5507          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity                    0.00             0.00       1.00
## Specificity                    1.00             1.00       0.00
## Pos Pred Value                  NaN              NaN       0.72
## Neg Pred Value                 0.94             0.78        NaN
## Prevalence                     0.06             0.22       0.72
## Detection Rate                 0.00             0.00       0.72
## Detection Prevalence           0.00             0.00       1.00
## Balanced Accuracy              0.50             0.50       0.50
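With no prior knowledge, how might we find the important inputs? A rough screen (a sketch, not part of the original analysis; acc is a name introduced here) fits a one-input SVM per column and ranks the in-sample accuracy. Training accuracy is optimistic, so cross-validation would be better, but it is a cheap first look:

acc <- sapply(seq_len(ncol(train_x)), function(i) {
  fit <- svm(train_x[, i, drop = FALSE], as.factor(train_y), kernel = "radial")
  mean(fitted(fit) == train_y)  # in-sample accuracy for input i alone
})
names(acc) <- colnames(train_x)
sort(acc, decreasing = TRUE)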
# Which inputs to select for classification training?
# Using two inputs we know to be important gives better results
traincols <- c(8,4)

train_x <- train[, traincols]
train_y <- train[, 33]

class(train_x) <- 'numeric'

test_x <- test[, traincols]
test_y <- test[, 33]

class(test_x) <- 'numeric'

Note: class.weights = 'inverse' doesn't work here, so the class weights are set by hand.
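
For reference, inverse-frequency weights are easy to compute by hand (a sketch; wts_inv is a name introduced here and is not used by the model below). The hand-tuned weights that follow do a similar job of upweighting the rare classes:

tab <- table(train_y)
wts_inv <- as.numeric(max(tab) / tab)  # rarer classes get larger weights
names(wts_inv) <- names(tab)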

# Hand-tuned class weights: upweight the rarer CRASHED and LOWCARBON classes
wts <- c(5, 2, 1)
names(wts) <- c('CRASHED', 'LOWCARBON', 'RAN')
svm_fit <- svm(train_x, as.factor(train_y), kernel = "radial", class.weights = wts, probability = FALSE)
svm_pred <- predict(svm_fit, newdata=test_x, probability = FALSE)
# Evaluate the model

confusionMatrix(svm_pred, as.factor(test_y))
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  CRASHED LOWCARBON RAN
##   CRASHED         1         2   0
##   LOWCARBON       3        17   5
##   RAN             2         3  67
## 
## Overall Statistics
##                                           
##                Accuracy : 0.85            
##                  95% CI : (0.7647, 0.9135)
##     No Information Rate : 0.72            
##     P-Value [Acc > NIR] : 0.001678        
##                                           
##                   Kappa : 0.6469          
##                                           
##  Mcnemar's Test P-Value : 0.440227        
## 
## Statistics by Class:
## 
##                      Class: CRASHED Class: LOWCARBON Class: RAN
## Sensitivity                  0.1667           0.7727     0.9306
## Specificity                  0.9787           0.8974     0.8214
## Pos Pred Value               0.3333           0.6800     0.9306
## Neg Pred Value               0.9485           0.9333     0.8214
## Prevalence                   0.0600           0.2200     0.7200
## Detection Rate               0.0100           0.1700     0.6700
## Detection Prevalence         0.0300           0.2500     0.7200
## Balanced Accuracy            0.5727           0.8351     0.8760

Plot the non-probabilistic SVM.

We still predict very few crashes (only one of the six test-set crashes is caught), which might need looking at. Accuracy is 0.85, which feels like it might be improved.

upper <- apply(train_x, 2, max)
lower <- apply(train_x, 2, min)

x1_seq <- seq(from=lower[1], to=upper[1], length.out = 50)
x2_seq <- seq(from=lower[2], to=upper[2], length.out = 50)

# A grid of test points that covers the entire data space
test_grid <- as.matrix(expand.grid(x1_seq, x2_seq))
colnames(test_grid) <- colnames(train_x)

svm_pred_grid <- predict(svm_fit, newdata=test_grid, probability = FALSE)


pal <- c('red', 'blue','grey')
par(las = 1, cex = 1.3)
plot(test_grid, col = pal[svm_pred_grid], pch = 20, cex = 0.8, main = 'SVM classifier prediction')

points(train_x, bg = pal[as.factor(train_y)], cex = 1, pch =21)

legend('topleft', legend = unique(as.factor(train_y)),
       pt.bg = pal[unique(as.factor(train_y))],
       pch = 21,
       col = 'black',
       text.col = pal[unique(as.factor(train_y))],
       bg = 'white',
       cex = 0.8)

Probabilistic classifier

svm_fit_prob <- svm(train_x, as.factor(train_y), probability = TRUE)
svm_pred_prob <- predict(svm_fit_prob, newdata=test_grid, probability = TRUE)
svm_pred_prob_df <- attr(svm_pred_prob,'prob')

ranmat <- matrix(svm_pred_prob_df[,'RAN'], nrow = length(x1_seq))

lowcarbonmat <- matrix(svm_pred_prob_df[,'LOWCARBON'], nrow = length(x1_seq))
crashedmat <- matrix(svm_pred_prob_df[,'CRASHED'], nrow = length(x1_seq))
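The grid probabilities are for plotting; we can also score the probabilistic model on the held-out test set, for example with the multiclass log loss (a sketch; test_prob and p_true are names introduced here):

test_prob <- attr(predict(svm_fit_prob, newdata = test_x, probability = TRUE), 'prob')
# Probability assigned to each test member's true class
p_true <- test_prob[cbind(seq_along(test_y), match(test_y, colnames(test_prob)))]
-mean(log(pmax(p_true, 1e-15)))  # mean log loss: lower is better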
pal <- c('red', 'blue','grey')
par(las = 1, cex = 1.3)
plot(test_grid, type = 'n', cex = 0.8, main = 'SVM probability of RAN')

contour(x = x1_seq, y = x2_seq, ranmat,
        add = TRUE, levels = c(0.2, 0.4, 0.6, 0.8, 1))

plot(test_grid, type = 'n', cex = 0.8, main = 'SVM probability of LOWCARBON')

contour(x1_seq, x2_seq, lowcarbonmat,
        add = TRUE, levels = c(0.2, 0.4, 0.6, 0.8, 1), col = pal[2])

plot(test_grid, type = 'n', cex = 0.8, main = 'SVM probability of CRASHED')

contour(x1_seq, x2_seq, crashedmat,
        add = TRUE, levels = c(0.5, 0.6, 0.7, 0.8), col = pal[3])

par(las = 1, cex = 1.3)


filled.contour(x = x1_seq, y = x2_seq, ranmat, levels = seq(from = 0, to = 1, by = 0.2), main = "RAN")


filled.contour(x = x1_seq, y = x2_seq, lowcarbonmat,levels = seq(from = 0, to = 1, by = 0.2), main = "LOWCARBON")

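
To answer the opening question: an obvious next step (a sketch, not run here) is to tune the cost and gamma hyperparameters by cross-validation, keeping the class weights:

tuned <- tune(svm, train.x = train_x, train.y = as.factor(train_y),
              ranges = list(gamma = 10^(-3:1), cost = 10^(0:3)),
              class.weights = wts,
              tunecontrol = tune.control(cross = 10))
summary(tuned)
svm_pred_tuned <- predict(tuned$best.model, newdata = test_x)
confusionMatrix(svm_pred_tuned, as.factor(test_y))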