This notebook builds a set of high-dimensional emulators of timeseries of JULES global mean carbon cycle outputs. It uses PCA to reduce the dimension of the output, and builds a Gaussian Process emulator of the output in the reduced-dimension space.
This notebook builds on the work of explore-jules-timeseries-emulate, and applies the emulator to multiple timeseries.
# Libraries
library(DiceKriging)
library(parallel)
# Helper functions
anomalizeTSmatrix <- function(x, ix){
# Express a timeseries matrix as anomalies from a baseline period.
# x .... numeric matrix, one timeseries per row
# ix ... column indices defining the baseline period
# Returns x with each row's baseline-period mean subtracted.
baseline_mean <- rowMeans(x[, ix])
sweep(x, 1, baseline_mean, FUN = '-')
}
mat2list <- function(X){
# Split a matrix into a list of its columns.
# X ... matrix with p columns
# Returns a length-p list whose i-th element is column i of X (as a vector).
lapply(seq_len(ncol(X)), function(j) X[, j])
}
reset <- function() {
# Collapse the device back to a single full-size panel and open an empty
# unit-square plot, so annotations (e.g. legends) can be overlaid on top
# of a multi-panel (small multiples) figure.
par(mfrow = c(1, 1), oma = rep(0, 4), mar = rep(0, 4), new = TRUE)
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "", axes = FALSE)
}
makeTransparent <- function(someColor, alpha = 100)
# Convert colour name(s)/hex codes to semi-transparent hex colours.
# someColor ... vector of colours understood by col2rgb()
# alpha ....... opacity on a 0-255 scale (default 100)
# Returns a character vector of "#RRGGBBAA" strings, one per input colour.
{
rgbvals <- col2rgb(someColor)
apply(rgbvals, 2, function(col_j) {
rgb(red = col_j[1], green = col_j[2], blue = col_j[3],
alpha = alpha, maxColorValue = 255)
})
}
pca_km <- function(X, Y, train_ix, test_ix, npctol = 0.99, scale = FALSE, center = TRUE, ...){
# emulate high dimensional output
# Emulate a high-dimensional (timeseries) output Y as a function of inputs X:
# 1. split into train/test, 2. reduce Y_train with PCA, 3. fit a DiceKriging
# km() emulator to each retained PC score, 4. predict scores for train (via
# leave-one-out) and test sets, 5. project predictions back to full dimension.
# X ......... design matrix of inputs (runs in rows)
# Y ......... matrix of outputs (runs in rows, e.g. years in columns)
# train_ix .. row indices used for training
# test_ix ... row indices held out for testing
# npctol .... cumulative proportion of variance the retained PCs must explain
# scale, center ... passed to prcomp()
# Returns a list with the fit objects, score predictions, reconstructed
# predictions (Ypred_*), their sds (sd_*) and errors (Yerr_*).
require(parallel)
# Split into training and test samples
X_train <- X[train_ix, ]
X_test <- X[test_ix, ]
Y_train <- Y[train_ix, ]
Y_test <- Y[test_ix, ]
#reduce dimension of the output
pca <- prcomp(Y_train, scale = scale, center = center)
# choose a number of pcs
pca_summary <- summary(pca)
# 2 PCs minimum
# importance[3, ] is the cumulative proportion of variance; if 2 PCs are not
# enough, take the first PC count that exceeds npctol, else keep exactly 2.
# (Cumulative proportion is monotonic, so npc >= 2 always holds here.)
if(pca_summary$importance[3,2] < npctol){
npc <- as.numeric(which(pca_summary$importance[3,] > npctol)[1])
}
else{
npc <- 2
}
scores_train_list <- mat2list(pca$x[, 1:npc])
# fitting the emulator is a slow step, so we use parallel computation
# Fit an emulator to each principal component in turn
# NOTE(review): mc.cores is hard-coded to 4 — confirm this suits the host.
fit_list <- mclapply(X = scores_train_list, FUN = km, formula = ~., design = X_train,
mc.cores = 4, control = list(trace = FALSE, maxit = 200))
scores_em_test_mean <- NULL
scores_em_test_sd <- NULL
scores_em_train_mean <- NULL
scores_em_train_sd <- NULL
# Build (n x npc) matrices of predicted score means and sds, one column per PC.
for(i in 1:npc){
# Leave-one-out gives honest in-sample predictions for the training runs
loo <- leaveOneOut.km(fit_list[[i]], type = 'UK', trend.reestim = TRUE)
pred_test <- predict(fit_list[[i]], newdata = X_test, type = 'UK')
# Predict training data (low dimension representation)
scores_em_train_mean <- cbind(scores_em_train_mean, loo$mean)
scores_em_train_sd <- cbind(scores_em_train_sd, loo$sd)
# Predict test data (low dimension representation)
scores_em_test_mean <- cbind(scores_em_test_mean, pred_test$mean)
scores_em_test_sd <- cbind(scores_em_test_sd, pred_test$sd)
}
# Project back up to high dimension
# rotation (p x npc) %*% t(scores) (npc x n) gives a (p x n) anomaly matrix;
# when prcomp scaled the data, multiply each row (output dimension) by
# pca$scale to undo the scaling. The sd matrices are transposed here; the
# anomaly matrices are transposed later when the centre is added back.
if(scale){
anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])*pca$scale
anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])*pca$scale
sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc])*pca$scale)
sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc])*pca$scale)
}
else{
anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])
anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])
sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc]))
sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc]))
}
# Add the centre back and return to (runs x output-dimension) orientation
Ypred_train <- t(anom_train + pca$center)
Ypred_test <- t(anom_test + pca$center)
# Emulator errors in the original (high-dimensional) space
Yerr_train <- Y_train - Ypred_train
Yerr_test <- Y_test - Ypred_test
return(list(X = X,
Y = Y,
train_ix = train_ix,
test_ix = test_ix,
X_train = X_train,
X_test = X_test,
npc = npc,
pca = pca,
fit_list = fit_list,
scores_em_train_mean = scores_em_train_mean,
scores_em_train_sd = scores_em_train_sd,
scores_em_test_mean = scores_em_test_mean,
scores_em_test_sd = scores_em_test_sd,
Ypred_train = Ypred_train,
Ypred_test = Ypred_test,
sd_train = sd_train,
sd_test = sd_test,
Yerr_train = Yerr_train,
Yerr_test = Yerr_test
))
}
pca_km_errorStats <- function(fit, nround = 2, compare_ix = NULL){
# Summarise emulator accuracy from a pca_km object.
# fit ........... output object from pca_km
# nround ........ decimals to round in output
# compare_ix .... training-set row indices to include in the training MAE
#                 (defaults to all rows; the test MAE always uses all rows)
# Returns a list with trainMAE and testMAE (mean absolute errors).
if (is.null(compare_ix)) {
compare_ix <- seq_len(nrow(fit$Yerr_train))
}
train_mae <- round(mean(abs(fit$Yerr_train[compare_ix, ])), nround)
test_mae <- round(mean(abs(fit$Yerr_test)), nround)
list(trainMAE = train_mae,
testMAE = test_mae)
}
pca_km_tsSparkPlot <- function(pca_km_obj, nr, nc, transp, maintext, yrs, obscol = 'black', predcol = 'red', ...){
# Timeseries test-set prediction sparkline plot (small multiples).
# pca_km_obj ... output object from pca_km
# nr, nc ....... rows / columns of the small-multiples grid
# transp ....... alpha (0-255) for the +-2sd uncertainty lines
# maintext ..... overall title written in the outer margin
# yrs .......... x-axis values; length must match ncol of the output matrix
# obscol, predcol ... colours for observed and predicted timeseries
# BUG FIX: previously indexed pca_km_obj$Y with the *global* variable
# `test_ix` rather than the test indices stored in the object, so the
# plot was only correct when a matching global happened to exist.
Y_test <- pca_km_obj$Y[pca_km_obj$test_ix, , drop = FALSE]
par(mfrow = c(nr,nc), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(1,0.1,5,0.1))
# Common y-limits across panels, wide enough for the +-2sd bounds
ylim <- range(
(pca_km_obj$Ypred_test + 2*(pca_km_obj$sd_test)),
( pca_km_obj$Ypred_test - 2*(pca_km_obj$sd_test)))
for(i in 1:length(pca_km_obj$test_ix)){
plot(yrs, Y_test[i, ], ylim = ylim, type = 'n', axes = FALSE)
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "grey90", border = "grey90") # grey panel background
lines(yrs, Y_test[i, ], col = obscol, lwd = 1.5)
lines(yrs, pca_km_obj$Ypred_test[i, ], col = predcol, lwd = 1.5)
lines(yrs, pca_km_obj$Ypred_test[i, ] + 2*(pca_km_obj$sd_test[i, ]), col = makeTransparent(predcol,transp))
lines(yrs, pca_km_obj$Ypred_test[i, ] - 2*(pca_km_obj$sd_test[i, ]), col = makeTransparent(predcol,transp))
}
mtext(maintext, side = 3, outer = TRUE, cex = 1.5, line = 1)
reset()
legend('topleft', col = c(obscol, predcol, makeTransparent(predcol,transp)),
legend = c('observed', 'predicted', '+-2sd'), lty = 'solid', horiz = TRUE, bty = 'n')
}
# A function to identify rows with outlier data (from Google Bard, adapted to update threshold)
findAnomalies <- function(matrix, iqrThres = 1.5, madThres = 1.5, na.rm = FALSE) {
# Identify anomalous rows of a matrix: rows containing NAs, rows with values
# far outside the whole-matrix interquartile range, and rows with large
# year-on-year jumps (discontinuities).
# matrix ..... numeric matrix (rows = ensemble members, columns = time)
# iqrThres ... multiplier of the global IQR beyond the quartiles that counts
#              as an outlier
# madThres ... multiplier of each row's MAD beyond which a jump counts as a
#              discontinuity
# na.rm ...... if FALSE, rows containing NAs are returned immediately;
#              if TRUE, NAs are removed from the outlier statistics
# Returns the (possibly empty) integer vector of anomalous row indices.
# Fix: naRows must exist even when na.rm = TRUE, otherwise the union()
# below errored with "object 'naRows' not found".
naRows <- integer(0)
# Check for NA values
if (!na.rm) {
# Fix: apply(matrix, 1, is.na) returns a logical *matrix*, so which() gave
# flattened element positions, not row numbers. anyNA per row gives rows.
naRows <- which(apply(matrix, 1, anyNA))
if (length(naRows) > 0) {
return(naRows)
}
}
# Check for outliers using IQR method (quartiles of the whole matrix)
iqr <- IQR(matrix, na.rm = na.rm)
q1 <- quantile(matrix, 0.25, na.rm = na.rm)
q3 <- quantile(matrix, 0.75, na.rm = na.rm)
outlierRows <- which(apply(matrix, 1, function(row) {
any(row < q1 - iqrThres * iqr | row > q3 + iqrThres * iqr)
}))
# Check for discontinuities: a jump larger than madThres * (row MAD)
discontinuityRows <- which(apply(matrix, 1, function(row) {
any(diff(row) > madThres * mad(row))
}))
# Combine NA, outlier, and discontinuity rows
anomalyRows <- union(union(naRows, outlierRows), discontinuityRows)
# Return index of anomaly rows
return(anomalyRows)
}
findAnomaliesTSLocal <- function(matrix, iqrThres = 5, madThres = 3, na.rm = FALSE) {
# Find outlier runs in an ensemble using *local* (per-column) statistics:
# a row is an outlier if any value lies more than iqrThres IQRs outside
# that column's quartiles, or has a jump larger than madThres * row MAD.
# matrix ..... numeric matrix (rows = ensemble members, columns = time)
# iqrThres ... per-column IQR multiplier for the outlier test
# madThres ... per-row MAD multiplier for the discontinuity test
# na.rm ...... if FALSE, rows containing NAs are returned immediately;
#              if TRUE, NAs are removed from the column statistics
# Returns the (possibly empty) integer vector of anomalous row indices.
# Fix: naRows must exist even when na.rm = TRUE, otherwise the union()
# below errored with "object 'naRows' not found".
naRows <- integer(0)
# Check for NA values
if (!na.rm) {
# Fix: apply(matrix, 1, is.na) returns a logical *matrix*, so which() gave
# flattened element positions, not row numbers. anyNA per row gives rows.
naRows <- which(apply(matrix, 1, anyNA))
if (length(naRows) > 0) {
return(naRows)
}
}
# find the local (per column) IQR and quartiles
# Fix: pass na.rm through to quantile() as well as IQR(), for consistency.
iqr <- apply(matrix, 2, FUN = IQR, na.rm = na.rm)
q1 <- apply(matrix, 2, FUN = quantile, probs = 0.25, na.rm = na.rm)
q3 <- apply(matrix, 2, FUN = quantile, probs = 0.75, na.rm = na.rm)
outlierRows <- which(apply(matrix, 1, function(row) {
any(row < q1 - iqrThres * iqr | row > q3 + iqrThres * iqr)
}))
# Check for discontinuities: a jump larger than madThres * (row MAD)
discontinuityRows <- which(apply(matrix, 1, function(row) {
any(diff(row) > madThres * mad(row))
}))
# Combine NA, outlier, and discontinuity rows
anomalyRows <- union(union(naRows, outlierRows), discontinuityRows)
# Return index of anomaly rows
return(anomalyRows)
}
# Load data
# The .RData file provides the ensembles (e.g. nbp_ens_wave00, npp_ens_wave01),
# the input design matrices (X, X_wave01_train) and the `years` vector used below.
load('data/ensemble-jules-historical-timeseries.RData')
# Use only the post-1950 years
# NOTE(review): 101:164 assumes the series starts in 1850 — confirm against `years`.
years_ix <- 101:164
years_trunc <- years[years_ix]
For each input/output set, it would be useful to have functions that identify ensemble members with “bad” data — outliers or NAs, for example.
# --- Cumulative NBP: build, clean, emulate and check ---
# Cumulative sum of NBP along each run's timeseries (rows = runs)
cnbp_all_ens_wave00 <- t(apply(nbp_ens_wave00, 1, FUN = cumsum))
cnbp_all_ens_wave01 <- t(apply(nbp_ens_wave01, 1, FUN = cumsum))
par(las = 1)
cnbp_all_col <- c(rep(makeTransparent('black',100), nrow(cnbp_all_ens_wave00)), rep(makeTransparent('red',100), nrow(cnbp_all_ens_wave01)) )
cnbp_all <- rbind(cnbp_all_ens_wave00, cnbp_all_ens_wave01)
matplot(years, t(cnbp_all_ens_wave00), type = 'l', lty = 'solid', col = makeTransparent('black',100), ylim = c(-130, 260), lwd =2,
ylab = 'CNBP (GtC/yr)', main = 'Cumulative NBP')
legend('topleft', legend = c('Wave00', 'Wave01' ), col =c('black', 'red'), lty = 'solid')
#matplot(years, t(cnbp_all), type = 'l', lty = 'solid', col = cnbp_all_col, ylim = c(-130, 260), lwd =2,
# ylab = 'CNBP (GtC/yr)', main = 'Cumulative NBP')
# NOTE(review): the block below repeats the plot above, adding the wave01 lines
par(las = 1)
cnbp_all_col <- c(rep(makeTransparent('black',100), nrow(cnbp_all_ens_wave00)), rep(makeTransparent('red',100), nrow(cnbp_all_ens_wave01)) )
cnbp_all <- rbind(cnbp_all_ens_wave00, cnbp_all_ens_wave01)
matplot(years, t(cnbp_all_ens_wave00), type = 'l', lty = 'solid', col = makeTransparent('black',100), ylim = c(-130, 260), lwd =2,
ylab = 'CNBP (GtC/yr)', main = 'Cumulative NBP')
legend('topleft', legend = c('Wave00', 'Wave01' ), col =c('black', 'red'), lty = 'solid')
matlines(years, t(cnbp_all_ens_wave01), type = 'l', lty = 'solid', col = makeTransparent('red',100), lwd =2)
# Truncate to the post-1950 window before accumulating
cnbp_ens_wave00 <- t(apply(nbp_ens_wave00[ , years_ix], 1, FUN = cumsum))
cnbp_ens_wave01 <- t(apply(nbp_ens_wave01[ , years_ix], 1, FUN = cumsum))
# Wave00 runs are removed by known-bad input settings rather than output stats
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(cnbp_ens_wave01, iqrThres = 5, madThres = 3)
# NOTE(review): if a kill_ix vector is empty, x[-integer(0), ] drops ALL rows
# (see the identical(..., integer(0)) guard used in the loop further below).
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
Y_combine <- rbind(cnbp_ens_wave00[-kill_ix_wave00,], cnbp_ens_wave01[-kill_ix_wave01, ])
Y_remove <- rbind(cnbp_ens_wave00[kill_ix_wave00,], cnbp_ens_wave01[kill_ix_wave01, ])
# Side-by-side check of retained vs removed ensemble members
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid',
col= c( rep('black', nrow(cnbp_ens_wave00[-kill_ix_wave00, ])), rep('red', nrow(cnbp_ens_wave01[-kill_ix_wave01, ]))),
main = 'retained runs', ylab = 'CNBP (GtC/yr)', xlab = 'year')
legend('topleft', legend = c('Wave00', 'Wave01' ), col =c('black', 'red'), lty = 'solid')
matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid', ylim = c(-100, 100),
col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
main = 'removed runs', ylab = 'CNBP (GtC/yr)', xlab = 'year')
# Hold out the last 20% of runs as a test set (runs are not shuffled first)
testprop <- 0.2
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
pc_km_cnbp_combine <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
# Scree plot (cumulative variance) and the first three PC loading timeseries
par(mfrow = c(1,4), las = 1)
cnbp_pca_summary <- summary(pc_km_cnbp_combine$pca)
plot(1:10, cnbp_pca_summary$importance[3,1:10], type = 'b', xlab = 'PCs', ylab = 'Proportion of variance explained')
abline(h = 0.99, lty = 'dashed')
for(i in 1:3){
plot(years_trunc, pc_km_cnbp_combine$pca$rotation[,i], type = 'l', ylab = '', xlab = 'year')
}
# Mean absolute error of the emulator, in and out of sample (recorded output below)
pca_km_errorStats(pc_km_cnbp_combine)
## $trainMAE
## [1] 5.45
##
## $testMAE
## [1] 7.23
# Observed vs (leave-one-out) predicted scores for the first three PCs
par(mfrow = c(1,3), las = 1)
for(i in 1:3){
# NOTE(review): scores_em_train_sd is used whole here rather than column i
# ([, i]); the range and error bars therefore mix sds across PCs — confirm.
rn <- range(c((pc_km_cnbp_combine$scores_em_train_mean[,i] - 2*(pc_km_cnbp_combine$scores_em_train_sd) ),
(pc_km_cnbp_combine$scores_em_train_mean[,i] + 2*(pc_km_cnbp_combine$scores_em_train_sd) )
))
plot(pc_km_cnbp_combine$pca$x[,i], pc_km_cnbp_combine$scores_em_train_mean[,i], main = paste0('PC ',i),
xlab = 'observed', ylab = 'predicted', ylim = rn, pty = 'n')
segments(pc_km_cnbp_combine$pca$x[,i], (pc_km_cnbp_combine$scores_em_train_mean[,i] - 2*(pc_km_cnbp_combine$scores_em_train_sd) ),
pc_km_cnbp_combine$pca$x[,i], (pc_km_cnbp_combine$scores_em_train_mean[,i] + 2*(pc_km_cnbp_combine$scores_em_train_sd) ),
col = makeTransparent('black', 50)
)
points(pc_km_cnbp_combine$pca$x[,i], pc_km_cnbp_combine$scores_em_train_mean[,i], pch = 20, col = 'black')
abline(0,1)
}
# Small-multiples view of every test-set prediction with uncertainty bounds
pca_km_tsSparkPlot(pc_km_cnbp_combine, nr = 8, nc = 20, yrs = years_trunc, maintext = 'CNBP test predictions', transp = 100)
# Observed vs predicted timeseries, train (left) and test (right)
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine[train_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',100), lwd = 2,
main = "Train",
ylab = 'CNBP (GtC/yr)')
matlines(years_trunc, t(pc_km_cnbp_combine$Ypred_train), lty = 'solid', col = makeTransparent('red',100), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
matplot(years_trunc, t(Y_combine[test_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',100), lwd = 2,
main = 'Test',
ylab = 'CNBP (GtC/yr)')
matlines(years_trunc, t(pc_km_cnbp_combine$Ypred_test), lty = 'solid', col = makeTransparent('red',100), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
# Pointwise observed-vs-predicted scatter, all years at once
plot(Y_combine[train_ix, ], pc_km_cnbp_combine$Ypred_train, pch = 20, col = makeTransparent('black', 100),
xlim = c(-50, 180), ylim = c(-50, 180))
points(Y_combine[test_ix, ], pc_km_cnbp_combine$Ypred_test, col = makeTransparent('red', 100), pch = 20)
abline(0,1)
# --- NPP and NPP anomaly: same clean / emulate / check workflow ---
par(mfrow = c(1,2))
npp_all_col <- c(rep(makeTransparent('black',100), nrow(npp_ens_wave00)), rep(makeTransparent('red',100), nrow(npp_ens_wave01)) )
npp_all <- rbind(npp_ens_wave00, npp_ens_wave01)
# Anomalies are taken relative to the mean of the first 20 columns
npp_anom_all <- anomalizeTSmatrix(npp_all, 1:20)
matplot(years, t(npp_all), type = 'l', lty = 'solid', col = npp_all_col, ylim = c(-10, 180), lwd =2,
ylab = 'NPP (GtC/yr)')
matplot(years, t(npp_anom_all), type = 'l', lty = 'solid', col = npp_all_col, ylim = c(-20, 45), lwd =2,
ylab = 'NPP anomaly (GtC/yr)')
# Wave00 runs removed by known-bad input settings, wave01 by output statistics
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(npp_ens_wave01[, years_ix], iqrThres = 5, madThres = 3)
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
# WARNING - if nothing is removed, nothing is combined
Y_combine <- rbind(npp_ens_wave00[-kill_ix_wave00, years_ix], npp_ens_wave01[-kill_ix_wave01, years_ix])
Y_remove <- rbind(npp_ens_wave00[kill_ix_wave00, years_ix], npp_ens_wave01[kill_ix_wave01, years_ix])
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid', col = makeTransparent('black',100),
lwd = 2, main = 'retained runs',
ylab = 'NPP (GtC/yr)')
matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid',
col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
main = 'removed runs', ylab = 'NPP (GtC/yr)')
# Hold out the last 20% of runs as a test set
testprop <- 0.2
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
pc_km_npp_combine <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
pca_km_tsSparkPlot(pc_km_npp_combine, nr = 8, nc = 20, yrs = years_trunc, maintext = 'NPP test predictions', transp = 100)
# Repeat for NPP anomalies (baseline = first 20 post-1950 years)
npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00[, years_ix], 1:20)
npp_anom_ens_wave01 <- anomalizeTSmatrix(npp_ens_wave01[, years_ix], 1:20)
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(npp_anom_ens_wave01, iqrThres = 5, madThres = 3)
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
# WARNING - if nothing is removed, nothing is combined
Y_combine <- rbind(npp_anom_ens_wave00[-kill_ix_wave00, ], npp_anom_ens_wave01[-kill_ix_wave01, ])
Y_remove <- rbind(npp_anom_ens_wave00[kill_ix_wave00, ], npp_anom_ens_wave01[kill_ix_wave01, ])
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid', col = 'black', main = 'retained runs')
matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid',
col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
main = 'removed runs')
testprop <- 0.2
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
pc_km_npp_anom_combine <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
pca_km_tsSparkPlot(pc_km_npp_anom_combine, nr = 8, nc = 20, yrs = years_trunc, maintext = 'NPP anomaly test predictions', transp = 100)
# --- Batch: clean, emulate and spark-plot every remaining output variable ---
# Each name corresponds to a pair of objects <name>_ens_wave00 / _ens_wave01
# loaded from the .RData file.
varnames <- c("baresoilFrac_lnd_mean",
"c3PftFrac_lnd_mean",
"c4PftFrac_lnd_mean",
#"cnbp",
"cSoil",
"cVeg",
"fHarvest_lnd_sum",
"fLuc_lnd_sum",
"lai_lnd_mean",
"nbp",
"npp",
"rh_lnd_sum",
"shrubFrac_lnd_mean",
"treeFrac_lnd_mean")
# Wave00 exclusions are fixed by input settings and shared by all variables.
# NOTE(review): assumed non-empty here; an empty vector would make
# X[-kill_ix_wave00, ] drop every row (negative-empty-index pitfall).
kill_ix_wave00 <- c(which(X[, 'f0_io'] > 0.9 | X[, 'b_wl_io'] < 0.15 ))
testprop <- 0.2
for(i in 1:length(varnames)){
varname <- varnames[i]
# Fetch the matching ensemble matrices by constructed name
ens_wave00 <- get(paste0(varname, '_ens_wave00'))[, years_ix]
ens_wave01 <- get(paste0(varname, '_ens_wave01'))[ , years_ix]
# remove those iqrThres times outside the IQR
kill_ix_wave01 <- findAnomaliesTSLocal(ens_wave01, iqrThres = 5, madThres = 3)
# Guard: x[-integer(0), ] would drop all rows, so only use negative
# indexing when there is something to remove.
if(identical(kill_ix_wave01, integer(0))){
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train)
Y_combine <- rbind(ens_wave00[-kill_ix_wave00, ], ens_wave01)
Y_remove <- rbind(ens_wave00[kill_ix_wave00, ])
}
else{
X_combine <- rbind(X[-kill_ix_wave00, ], X_wave01_train[-kill_ix_wave01, ])
Y_combine <- rbind(ens_wave00[-kill_ix_wave00, ], ens_wave01[-kill_ix_wave01, ])
Y_remove <- rbind(ens_wave00[kill_ix_wave00, ], ens_wave01[kill_ix_wave01, ])
}
#par(mfrow = c(1,2), oma = c(0.1, 0.1, 5, 0.1))
#matplot(years_trunc, t(Y_combine), type = 'l', lty = 'solid', col = 'black', main = 'retained runs')
#matplot(years_trunc, t(Y_remove), type = 'l', lty = 'solid',
#col = c(rep('black', length(kill_ix_wave00)), rep('red', length(kill_ix_wave01))),
#main = 'removed runs')
#mtext(side = 3, text = varname, outer = TRUE, line = 3, cex = 2)
# Each fitted emulator is stored in the global env as pc_km_<varname>
outname <- paste0('pc_km_', varname)
# Hold out the last 20% of runs as a test set
nruns <- nrow(Y_combine)
ntest <- floor(nruns * testprop)
ntrain <- (nruns - ntest)
train_ix <- 1:ntrain
test_ix <- (ntrain+1):nruns
pc_km <- pca_km(X = X_combine,
Y_combine,
train_ix = train_ix,
test_ix = test_ix,
npctol = 0.99,
scale = TRUE,
center = TRUE)
assign(outname, pc_km)
pca_km_tsSparkPlot(get(outname), nr = 8, nc = 20, yrs = years_trunc, maintext = varname, transp = 100)
}