Visualising a large perturbed parameter ensemble of the land surface model JULES

This notebook finds the input space of the land surface model JULES that matches some basic observations of the global carbon cycle, at the start of the 2st Century. The ensemble is 500 members, driven with a global reanalysis that covers the 20th Century.

Load helper functions

knitr::opts_chunk$set(fig.path = "figs/")
# load some helper functions
source('es_ppe_ii_viz.fun.R')
source('../per_pft.R')

Load data and calculate some useful constants

years = 1861:2014
ysec = 60*60*24*365
norm.vec = c(1e12, 1e12, 1e12/ysec , 1e12, 1e12, 1e9)

# Load up the data
lhs_i = read.table('data/ES_PPE_i/lhs_u-ao732.txt', header = TRUE)
lhs_ii = read.table('data/ES_PPE_ii/lhs_u-ao732a.txt', header = TRUE)

toplevel.ix = 1:499

# The raw input data is a latun hypercube
lhs = rbind(lhs_i, lhs_ii)[toplevel.ix, ]

X = normalize(lhs)
colnames(X) = colnames(lhs)
d = ncol(X)
# lower and higher bound on the normalised matrix for visualisation
rx = rbind(rep(0,32), rep(1,32))

fnallvec = dir('data/ES_PPE_ii_test/', pattern = 'Annual')
# WARNING - hard coded hack to sort
fidx = grep("Annual.(?!Amazon).*", fnallvec, perl=TRUE)
fnvec_interim = fnallvec[fidx]
fidx2 = grep("sum.(?!standard).*", fnvec_interim, perl=TRUE)
fnvec = fnvec_interim[fidx2]
fnlocvec = paste0('data/ES_PPE_ii_test/', fnvec)

# Extract the bit of the filename that describes the data
fs = lapply(fnvec,regexpr, pattern = 'Annual')
fe = lapply(fnvec,regexpr, pattern = 'global_sum.txt')

fnams = rep(NA, length(fnvec))
for(i in 1:length(fnvec)){
  fnams[i] = substr(fnvec[i], attr(fs[[1]], 'match.length')+2, fe[[i]][1]-2)
}

# Constrain on runoff first
datmat.raw = matrix(nrow = nrow(X), ncol = length(fnlocvec))
for(i in 1:length(fnlocvec)){
  dat = load_ts_ensemble(fnlocvec[i])[toplevel.ix, ]
  dat.modern = dat[ ,135:154]
  mean.modern = apply(dat.modern, 1, mean)
  datmat.raw[ , i] = mean.modern
}
colnames(datmat.raw) = fnams

# Here 'normalised' means to a convenient unit (such as Sv for runoff)
dat.norm = sweep(datmat.raw, 2, norm.vec, FUN = '/')
p = ncol(dat.norm)
head(dat.norm)

##          cs_gb       cv    gpp_gb        nbp npp_n_gb    runoff
## [1,]  862.6446 145.2564 154.74537  0.7382599 61.17287 0.7954737
## [2,]  395.0432 391.9913  96.82846  1.4587248 35.40340 0.8274601
## [3,]  647.7782 147.9553 118.90562  0.8044928 46.47053 0.9690386
## [4,] 1261.4713 477.7939 154.24656  1.9786981 46.38607 0.8356007
## [5,]  684.0237  72.3248 107.89775  0.5617951 33.47812 1.0217362
## [6,]  465.2629  14.8665  91.55046 -0.4019303 37.82214 1.0465577

This matrix shows the first few rows of the output data: soil carbon (cs_gb), vegetation carbon (cv), gross primary production (gpp_gb), net biome production (nbp), net primary production (npp_n_gb) and runoff.

hcol = 'darkgrey'
lcol = 'black'
par(mfrow = c(3,2), fg = 'darkgrey', las = 1)

hist(dat.norm[,'runoff'], col = hcol, main = 'Runoff', xlab = 'Sv')

hist(dat.norm[,'nbp'], col = hcol, main = 'NBP', xlab = 'GtC/year')

hist(dat.norm[,'cs_gb'], col = hcol, main = 'Soil Carbon', xlab = 'GtC')

hist(dat.norm[,'cv'], col = hcol, main = 'Vegetation Carbon', xlab = 'GtC')

hist(dat.norm[,'npp_n_gb'], col = hcol , main = 'NPP', xlab = 'GtC/year')

The histograms show the raw model output.

par(mfrow = c(4, 8), mar = c(1,1,2,1))
for(i in 1:d){
  plot(lhs[ ,i],dat.norm[ ,6], axes = FALSE, xlab = '', ylab = '',
       main = colnames(lhs)[i], cex.main = 0.8)
}

Here is the runoff plotted against each parameter - there are plenty of unrealistically low ensemble members.

A “level 0” constraint

A quick look at the histograms of the ensemble output show that there are a number of ensemble members with very low (close to zero) runoff. Inspecting these shows that they are highly unrealistic in lots of other ways as well (e.g. very low gpp/npp). This offers an opportunity to remove runs early on that might reduce the accuracy of an emulator - a “level 0” constraint.

# Histogram of level 0 constraints
hcol = 'darkgrey'
lcol = 'black'
par(mfrow = c(3,2), fg = 'darkgrey', las = 1)

hist(dat.norm[,'runoff'], col = hcol, main = 'Runoff', xlab = 'Sv')
polygon(x = c(0.5, 100, 100, 0.5), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))

hist(dat.norm[,'nbp'], col = hcol, main = 'NBP', xlab = 'GtC/year')

hist(dat.norm[,'cs_gb'], col = hcol, main = 'Soil Carbon', xlab = 'GtC')

hist(dat.norm[,'cv'], col = hcol, main = 'Vegetation Carbon', xlab = 'GtC')

hist(dat.norm[,'npp_n_gb'], col = hcol , main = 'NPP', xlab = 'GtC/year')

Red shaded areas show the regions of model output retained in a level 0 constraint. We remove any ensemble member with runoff less than or equal to 0.5 Sv, and NBP less that -10

allix = 1:(nrow(dat.norm))
level0.ix = which(dat.norm[,'runoff'] >0.5 & dat.norm[,'nbp'] > -10)
dat.level0  = dat.norm[level0.ix, ]
X.level0 = X[level0.ix, ]
lhs.level0 = lhs[level0.ix, ]

nlevel0.ix = setdiff(allix, level0.ix)
X.nlevel0 = X[nlevel0.ix, ]
lhs.nlevel0 = lhs[nlevel0.ix, ]

# These are the ensemble members which do not run (produce NA)
na.ix = which(is.na(dat.norm[,'runoff']))
X.na = X[na.ix, ]
lhs.na = lhs[na.ix, ]

lhs.rx = round(apply(lhs,2,range),1)

mins  = apply(X.level0, 2, min)
maxes = apply(X.level0, 2, max)

par(mfrow = c(3,2))
for(i in 1:6){
  hist(dat.norm[level0.ix,i], main = fnams[i])
}

Plotting the remaining members shows that many of the members with low GPP and NPP have been removed as well.

Parallel coordinates plots of input space with a level 0 constraint

# Parallel Coordinates plot of NROY and ruled out members, level 0
#dev.new(width = 20, height = 9)
#pdf(file = 'graphics/pcp_level0.pdf', width = 20, height = 9)
par(mfrow = c(2,1), las = 2, mar = c(7,4,4,1), cex.axis = 0.8)

# This function doesn't recale the maximum and minimum of each parameter, so you can easier see where there are gaps.
parcoord.notsilly(X.level0, rx, col = makeTransparent('black', 100), ylim = c(0,1),
         main = 'level 0 NROY ensemble members', var.label = TRUE)
                  
parcoord.notsilly(X.nlevel0, rx, col = makeTransparent('black', 100), ylim = c(0,1),
         main = 'level 0 ruled out ensemble members', var.label = TRUE)
                  
parcoord.notsilly(X.na,rx, col = makeTransparent('red', 255), add = TRUE)

reset()
legend('left',
       legend = 'Did Not Run',
       inset = 0.1,
       col = 'red',
       cex = 1.2,
       lwd = 2,
       horiz = TRUE)

The parallel coordinates plot on the top show those ensemble members that are NOT Ruled Out Yet (NROY) by the “level 0” constraints. The bottom shows those that are ruled out by the constraints (black) and those ensemble members that did not run at all (red). On the bottom diagram, you can see how many ensemble members fail to run when b_wl is low, suggesting that the lower limit of that parameter is set too low. Similarly, you can see that the initial constraints rule out many members when F0_io is high and b_wl is low, suggesting that this particular combination of parameters causes problems with the modelling.

Pairs plot of ruled-out ensemble member input settings

# Pairs plot of level 0 constraint

# Pairs plot of level 0 constraint excluded members is much easier to interpret
#dev.new(width = 10, height = 10)
#pdf(file = 'graphics/pairs_nlevel0.pdf', width = 10, height = 10)
pairs(rbind(X.nlevel0, X.na), gap = 0, lower.panel = NULL,
      labels = 1:d,
      cex.lab = 0.8,
      xlim = c(0,1), ylim = c(0,1),
      col = c(rep(makeTransparent('black', 50), nrow(X.nlevel0)), rep(makeTransparent('red', 100), nrow(X.na))),
      pch = 20,
      xaxt = 'n', yaxt = 'n'
      )
par(xpd = NA)
#text(0.2, 0.6, labels = paste0(colnames(lhs)[1:5],'\n'))

legend('left', legend = paste(1:d, colnames(lhs)), cex = 0.9, bty = 'n')
mtext(side = 1, text = 'Level 0 ruled out ensemble members', cex = 2)
reset()
legend('bottom',
       legend = c('Ruled out', 'Did Not Run'),
       inset = 0.15,
       col = c(makeTransparent('black', 50),makeTransparent('red', 100)),
       cex = 1.1,
       pch = 20,
       horiz = TRUE)

A pairs plot of the ensemble members that did not run (red) or were ruled out by the level 0 constraints (black).

Level 1 constraints

# Histogram of level 1 constraints
hcol = 'darkgrey'
lcol = 'black'
par(mfrow = c(3,2), fg = 'darkgrey', las = 1)

hist(dat.norm[,'runoff'], col = hcol, main = 'Runoff', xlab = 'Sv')
polygon(x = c(0.5, 100, 100, 0.5), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))
hist(dat.norm[,'nbp'], col = hcol, main = 'NBP', xlab = 'GtC/year')
polygon(x = c(-10, 100, 100, -10), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))
hist(dat.norm[,'cs_gb'], col = hcol, main = 'Soil Carbon', xlab = 'GtC')
polygon(x = c(750, 3000, 3000, 750), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))

hist(dat.norm[,'cv'], col = hcol, main = 'Vegetation Carbon', xlab = 'GtC')
polygon(x = c(300, 800, 800, 300), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
       border =  makeTransparent('tomato2'))
hist(dat.norm[,'npp_n_gb'], col = hcol , main = 'NPP', xlab = 'GtC/year')
polygon(x = c(35, 80, 80, 35), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2')
)

The histograms show the full ensemble of runs (in grey) along with the location of some initial “level 1” constraints (red zones). These are the limits that the modeller (Andy Wiltshire) is willing to tolerate in a number of carbon cycle outputs before rejecting the ensemble member as useless. Any ensemble member falling outside these limits is excluded from the ensemble before any emulator is fit.

Visualising the level 1 input space

Apply the level 1 constraints. Only 53 (or around 11 %) of the ensemble members are retained as NROY

# Apply level 1 constraints
level1.ix = which(dat.norm[,'cs_gb'] > 750 & dat.norm[,'cs_gb'] < 3000 &
  dat.norm[,'cv'] > 300 & dat.norm[,'cv'] < 800 & 
  dat.norm[,'npp_n_gb'] > 35 &
  dat.norm[,'npp_n_gb'] < 80 &
  dat.norm[,'runoff'] >0.5 &
  dat.norm[,'nbp'] > -10
  )

X.level1 = X[level1.ix, ]

# Ranges of the inputs after a level 1 constraint
rn.l1 = apply(X.level1,2, range)

nlevel1.ix = setdiff(allix, level1.ix)
X.nlevel1 = X[nlevel1.ix, ]

dat.level1 = dat.norm[level1.ix, ]

# how many ensemble members are excluded?
nrow(dat.level1)

## [1] 53

(nrow(dat.level1) / nrow(dat.norm)) * 100  # in %

## [1] 10.62124

# Parallel Coordinates plot of NROY and ruled out members, level 1

par(mfrow = c(2,1), las = 2, mar = c(7,4,4,1), cex.axis = 0.8)
parcoord.notsilly(X.level1, rx = rx, col = makeTransparent('black', 100), ylim = c(0,1), var.label = TRUE,
         main = 'level 1 NROY ensemble members')

parcoord.notsilly(X.nlevel1, rx = rx, col = makeTransparent('black', 100), ylim = c(0,1),
         main = 'level 1 ruled out ensemble members', var.label = TRUE)

This parallel coordinates plot doesn’t give so much information, but the pairs plot of the NROY ensemble members is more informative

# Pairs plot of level 1 constraint
#dev.new(width = 10, height = 10)
#pdf(file = 'graphics/pairs_level1.pdf', width = 10, height = 10)
pairs(X.level1, gap = 0, lower.panel = NULL,
      labels = 1:d,
      cex.lab = 0.8,
      xlim = c(0,1), ylim = c(0,1),
      col = makeTransparent('black', 100),
      pch = 20,
      xaxt = 'n', yaxt = 'n'
      )
par(xpd = NA)

legend('left', legend = paste(1:d, colnames(lhs)), cex = 0.9, bty = 'n')
mtext(side = 1, text = 'Level 1 NROY ensemble members', cex = 2)

Use a Gaussian process emulator to perform a level1 constraint

The emulator is built using the ensemble members not ruled out by the “level 0” constraints.

# Samples from a uniform distribution across all of input space
nsamp.unif = 100000   
X.unif = samp.unif(nsamp.unif, mins = mins, maxes = maxes)

y.unif = matrix(nrow = nsamp.unif, ncol = ncol(dat.level0))
colnames(y.unif) = colnames(dat.norm)

global.emlist = vector('list',length(fnams))

# Build an emulator for each output individually
for(i in 1:ncol(y.unif)){
  em = twoStep.glmnet(X = X.level0, y = dat.level0[,i])
  global.emlist[[i]] = em
  pred = predict(em$emulator, newdata = X.unif, type = 'UK')
  y.unif[,i] = pred$mean
}

# Find the inputs where the mean of the emulated output is 
# within the tolerable limits set by the modeller.
ix.kept = which(
  y.unif[,'cs_gb'] > 750 & y.unif[,'cs_gb'] < 3000 &
  y.unif[,'cv'] > 300 & y.unif[,'cv'] < 800 & 
  y.unif[,'npp_n_gb'] > 35 & y.unif[,'npp_n_gb'] < 80 &
  y.unif[,'runoff'] >0.5 &
  y.unif[,'nbp'] > -10)
X.kept = X.unif[ix.kept, ]

# we've removed 80% of our prior input space
(nrow(X.kept) / nsamp.unif) * 100

## [1] 18.087

ix.rejected = setdiff(1:nsamp.unif, ix.kept)
X.rejected = X.unif[ix.rejected, ]

Parallel coordinate plot of emulated members

This doesn’t tell you very much. Reordering the axes might help here - ggplot has a parallel coordinates plot function that will reorder to various criteria.

par(las = 2, cex.axis = 0.8, mfrow = c(2,1))

parcoord.notsilly(X.kept[1:1000,], col = makeTransparent('black', 5), ylim = c(0,1))
parcoord.notsilly(X.rejected[1:1000,], col = makeTransparent('black', 5), ylim = c(0,1))

Pairs plot of NROY density

This plot shows the density of emulated inputs that are retained when the level 1 constraints are applied.

blues = brewer.pal(9, 'Blues')
#pdf(file = 'graphics/pairs_dens_level0_km.pdf', width = 10, height = 10)
#dev.new(width = 10, height = 10)
par(oma = c(0,0,0,3), bg = 'white')
pairs(X.kept,
      labels = 1:d,
      gap = 0, lower.panel = NULL, xlim = c(0,1), ylim = c(0,1),
      panel = dfunc.up,
      cex.labels = 1,
      col.axis = 'white',
      dfunc.col = blues)

image.plot(legend.only = TRUE,
           zlim = c(0,1),
           col = blues,
           legend.args = list(text = 'Density of model runs matching the criteria', side = 3, line = 1),
           horizontal = TRUE
)

legend('left', legend = paste(1:d, colnames(lhs)), cex = 0.9, bty = 'n')