1 Introduction

Update of explore-JULES-ES-1p0.Rmd that uses the new packages and functions

We build and test gaussian process emulators, perform a sensitivity analyis, and constrain the input space of Jules.

Andy thinks there might be mileage in analysing the atmospheric growth, which is here. /home/h01/hadaw/Research/ES_PPE/Oct20

At the moment, this vignette is hampered by the fact that emulators are failing on a few of the outputs which represent change over the historical period. The emulator is fine for predicting absolute values in the modern period.

Andy would like to see timeseries of: cVeg, cSoil and nbp, npp in GtC and GtC/yr.

1.1 Preliminaries

Load libraries, functions and data.

# Load helper functions

knitr::opts_chunk$set(fig.path = "figs/", echo = TRUE, message = FALSE, warnings = FALSE)
# load some helper functions
source('~/brazilCSSP/code/brazil_cssp/per_pft.R') # eventually, move the relevant functions
source('explore-JULES-ES-1p0_PPE_functions.R')

# Load packages

library(RColorBrewer)
library(fields)
library(MASS)
library(DiceKriging)
library(DiceEval)
library(ncdf4)
library(ncdf4.helpers)
library(readxl)

library(foreach)

library(emtools)
library(imptools)
library(viztools)
library(julesR)

# Some pallete options
yg = brewer.pal(9, "YlGn")
ryb = brewer.pal(11, "RdYlBu")
byr = rev(ryb)
rb = brewer.pal(11, "RdBu")
br = rev(rb)
blues = brewer.pal(9, 'Blues')
cbPal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

ysec = 60*60*24*365
years <- 1850:2013

1.2 Carbon budget data

Section 2.5 in Friedlingstein et al. describes how the land carbon sink is estimated.

# Question: How closely should our model match this curve? Which output?
# (My guess is 'Total Land Carbon anomaly')
historical_carbon_budget <- read_excel('Global_Carbon_Budget_2020v1.0.xlsx', sheet = "Historical Budget", skip = 15)

par(mfrow = c(3,1))
ylim = c(-1, 6)

plot(historical_carbon_budget$Year, historical_carbon_budget$`land sink`, type = 'l', bty = 'n', ylim = ylim)

## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

plot(historical_carbon_budget$Year, historical_carbon_budget$`land-use change emissions`, type = 'l', bty = 'n', ylim = ylim)

## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

plot(historical_carbon_budget$Year, historical_carbon_budget$`atmospheric growth`, type = 'l', bty = 'n', ylim = ylim)

## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

1.2.1 Jules output data location

ensloc <- '/project/carbon_ppe/JULES-ES-1p0_PPE/'
ensmember <- 'P0000/'
subdir <- 'stats/'

floc <- paste0(ensloc,ensmember,subdir)
fn <- 'JULES-ES-1p0_P0000_Annual_global.nc'

# test file
nc <- nc_open(paste0(floc,fn))

# What variables are in the file? 
varlist <- nc.get.variable.list(nc)


for(i in 1:length(varlist)){
  
  var <- varlist[i]
  print(var)
  print(ncatt_get(nc,var,"long_name")[[2]])
}

## [1] "nbp_lnd_sum"
## [1] "Carbon Mass Flux out of Atmosphere Due to Net Biospheric Production on Land"
## [1] "year"
## [1] "year"
## [1] "nbp_lnd_mean"
## [1] "Carbon Mass Flux out of Atmosphere Due to Net Biospheric Production on Land"
## [1] "fLuc_lnd_sum"
## [1] "Net Carbon Mass Flux into Atmosphere Due to Land-Use Change"
## [1] "fLuc_lnd_mean"
## [1] "Net Carbon Mass Flux into Atmosphere Due to Land-Use Change"
## [1] "npp_nlim_lnd_sum"
## [1] "Net Primary Production on Land as Carbon Mass Flux"
## [1] "npp_nlim_lnd_mean"
## [1] "Net Primary Production on Land as Carbon Mass Flux"
## [1] "cSoil_lnd_sum"
## [1] "carbon mass in model soil pool"
## [1] "cSoil_lnd_mean"
## [1] "carbon mass in model soil pool"
## [1] "cVeg_lnd_sum"
## [1] "carbon mass in vegetation"
## [1] "cVeg_lnd_mean"
## [1] "carbon mass in vegetation"
## [1] "landCoverFrac_lnd_sum"
## [1] "Plant Functional Type Grid Fraction"
## [1] "landCoverFrac_lnd_mean"
## [1] "Plant Functional Type Grid Fraction"
## [1] "fHarvest_lnd_sum"
## [1] "Carbon Mass Flux into Atmosphere Due to Crop Harvesting"
## [1] "fHarvest_lnd_mean"
## [1] "Carbon Mass Flux into Atmosphere Due to Crop Harvesting"
## [1] "lai_lnd_sum"
## [1] "leaf area index"
## [1] "lai_lnd_mean"
## [1] "leaf area index"
## [1] "rh_lnd_sum"
## [1] "Total Heterotrophic Respiration on Land as Carbon Mass Flux"
## [1] "rh_lnd_mean"
## [1] "Total Heterotrophic Respiration on Land as Carbon Mass Flux"
## [1] "treeFrac_lnd_sum"
## [1] "Fractional cover of each surface type"
## [1] "treeFrac_lnd_mean"
## [1] "Fractional cover of each surface type"
## [1] "c3PftFrac_lnd_sum"
## [1] "Percentage Cover by C3 Plant Functional Type"
## [1] "c3PftFrac_lnd_mean"
## [1] "Percentage Cover by C3 Plant Functional Type"
## [1] "c4PftFrac_lnd_sum"
## [1] "Percentage Cover by C3 Plant Functional Type"
## [1] "c4PftFrac_lnd_mean"
## [1] "Percentage Cover by C3 Plant Functional Type"
## [1] "shrubFrac_lnd_sum"
## [1] "Percentage Cover by Shrub"
## [1] "shrubFrac_lnd_mean"
## [1] "Percentage Cover by Shrub"
## [1] "baresoilFrac_lnd_sum"
## [1] "Bare Soil Percentage Area Coverage"
## [1] "baresoilFrac_lnd_mean"
## [1] "Bare Soil Percentage Area Coverage"
## [1] "residualFrac_lnd_sum"
## [1] "Percentage of Grid Cell That Is Land but neither Vegetation Covered nor Bare Soil"
## [1] "residualFrac_lnd_mean"
## [1] "Percentage of Grid Cell That Is Land but neither Vegetation Covered nor Bare Soil"

1.3 Extract a time series of an annual, globally aggregated variable

extractTimeseries <- function(nc, variable){
    dat <- ncvar_get(nc, variable)
    out <- dat
    out
}

makeTimeseriesEnsemble <- function(variable, nens = 499, nts = 164, cn = 1850:2013){
  
  # nens is number of ensemble members
  # nts length of timeseries
  # cn is colnames()
  datmat <- matrix(NA, nrow = nens, ncol = nts)
  colnames(datmat) <- cn
  
  enslist <- paste("P", formatC(0:(nens-1), width=4, flag="0"), sep="")
  floc <- paste0(ensloc,ensmember,subdir)
  
  for(i in 1:nens){
    
    vec <- rep(NA,nts)
    
    ensmember <- enslist[i] 
    
    fn <- paste0(ensloc,ensmember,'/stats/','JULES-ES-1p0_',ensmember,'_Annual_global.nc')
 
    
    try(nc <- nc_open(paste0(fn)))
    try(dat <- extractTimeseries(nc, variable))
    
    datmat[i, ] <- dat
    nc_close(nc)
  }
  datmat
}

1.4 Plot timeseries of aggregated global timeseries

par(mfrow= c(1,4), las = 1)

plotcol <- c(makeTransparent('black', 70))


ylim = range(c(npp_ens[,1], npp_ens[,164]))
matplot(years, t(npp_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC/year', main = 'NPP',
        bty = 'n')

ylim = range(c(cSoil_ens[,1], cSoil_ens[,164]))
matplot(years, t(cSoil_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'Soil Carbon',
        bty = 'n')

ylim = range(c(cVeg_ens[,1], cVeg_ens[,164]))
matplot(years, t(cVeg_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'Vegetation Carbon',
        bty = 'n')

ylim = range(total_land_carbon_ens)
matplot(years, t(total_land_carbon_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol, 
        ylab = 'GtC', main = 'Total Land Carbon',
        bty = 'n')

par(mfrow= c(1,4), las = 1)

ylim = range(lai_lnd_mean_ens)
matplot(years, t(lai_lnd_mean_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'LAI', main = 'Leaf area index',
        bty = 'n')

ylim = range(treeFrac_lnd_mean_ens)
matplot(years, t(treeFrac_lnd_mean_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'fraction', main = 'Tree fraction',
        bty = 'n')

ylim = range(shrubFrac_lnd_mean_ens)
matplot(years, t(shrubFrac_lnd_mean_ens ), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'fraction', main = 'Shrub fraction',
        bty = 'n')

ylim = range(baresoilFrac_lnd_mean_ens)
matplot(years, t(baresoilFrac_lnd_mean_ens ), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'fraction', main = 'bare soil fraction',
        bty = 'n')

par(mfrow= c(1,3), las = 1)

ylim = c(-10, 10)
matplot(years, t(nbp_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol, 
        ylab = 'GtC/year', main = 'NBP',
        bty = 'n')
  
ylim = range(rh_lnd_sum_ens)
matplot(years, t(rh_lnd_sum_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'fraction', main = 'Heterotrophic Respiration',
        bty = 'n')

ylim = range(fLuc_lnd_sum_ens)
matplot(years, t(fLuc_lnd_sum_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC/year', main = 'Land use change emissions',
        bty = 'n')

ylim = range(fHarvest_lnd_sum_ens)
matplot(years, t(fHarvest_lnd_sum_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC/year', main = 'Harvest C flux to atmosphere',
        bty = 'n')

npp_ens_anom <- anomalizeTSmatrix(npp_ens, 1:20)
nbp_ens_anom <- anomalizeTSmatrix(nbp_ens, 1:20)
cSoil_ens_anom <- anomalizeTSmatrix(cSoil_ens, 1:20)
cVeg_ens_anom <- anomalizeTSmatrix(cVeg_ens, 1:20)
total_land_carbon_anom <- anomalizeTSmatrix(total_land_carbon_ens, 1:20)

par(mfrow= c(1,4), las = 1)

# ylim = range(-10, 10)
#matplot(years, t(nbp_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
#        ylab = 'NBP sum Anomaly', main = 'NBP Anomaly',
#        bty = 'n')

ylim = range(c(npp_ens_anom[,1], npp_ens_anom[,164]))
matplot(years, t(npp_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'NPP sum Anomaly', main = 'NPP Anomaly',
        bty = 'n')

ylim = range(c(cSoil_ens_anom[,1], cSoil_ens_anom[,164]))
matplot(years, t(cSoil_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'Soil Carbon Anomaly',
        bty = 'n')

ylim = range(c(cVeg_ens_anom[,1], cVeg_ens_anom[,164]))
matplot(years, t(cVeg_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'Vegetation Carbon Anomaly',
        bty = 'n')

ylim = range(total_land_carbon_anom)
matplot(years, t(total_land_carbon_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'Total Land Carbon Anomaly',
        bty = 'n')

1.5 Can we understand change and absolute?

1.6 Function to extract the “modern value” direct from the file (last 20 years of the timeseries)

# for each of the variables in the file, average the last 20 years as the "modern" value,
# and then place in a matrix

modernValue <- function(nc, variable, ix){
  # A basic function to read a variable and 
  # take the mean of the timeseries at locations ix
  dat <- ncvar_get(nc, variable)
  out <- mean(dat[ix])
  out
}
#144:164 is the 1993:2013
modernValue(nc = nc, variable = "npp_nlim_lnd_mean", ix = 144:164)

# apply to the test file to check it works
vec <- sapply(varlist, FUN = modernValue, nc = nc, ix = 144:164)

1.7 Loop to extract the “modern value” of a number of model outputs

# Generate ensemble numbers, mean of the last 20 years of the timeseries (1994-2013)

if (file.exists("ensemble.rdata")) {
  load("ensemble.rdata")
} else {
  
  nens = 499
  datmat <- matrix(nrow = nens, ncol = length(varlist))
  colnames(datmat) <- varlist
  
  enslist <- paste("P", formatC(0:(nens-1), width=4, flag="0"), sep="")
  floc <- paste0(ensloc,ensmember,subdir)
  
  for(i in 1:nens){
    
    vec <- rep(NA, length(varlist))
    
    ensmember <- enslist[i] 
    
    fn <- paste0(ensloc,ensmember,'/stats/','JULES-ES-1p0_',ensmember,'_Annual_global.nc')
    #print(fn)
    
    try(nc <- nc_open(paste0(fn)))
    try(vec <- sapply(varlist, FUN = modernValue, nc = nc, ix = 144:164))
    datmat[i, ] <- vec
    nc_close(nc)
  }
  
  save(nens, datmat,enslist,floc, file ="ensemble.rdata")
}

1.8 Calculate an ensemble of anomalies for all variables

For each ensemble member and each variable, calculate the change from the 20 years at the start of the run, to the twenty years at the end of the run.

tsAnomaly <- function(nc, variable, startix = 1:20, endix = 144:164){

  # A basic function to read a variable and calculate the anomaly at the end of the run
  dat <- ncvar_get(nc, variable)
  endMean <- mean(dat[endix])
  startMean <- mean(dat[startix])
  out <- endMean - startMean
  out
}

tsAnomaly(nc = nc, variable = "npp_nlim_lnd_mean")

## [1] 2.410932e-09

# Generate ensemble  mean of the last 20 years of the timeseries (1994-2013)

if (file.exists("anomaly_ensemble.rdata")) {
  load("anomaly_ensemble.rdata")
} else {
  
  nens = 499
  datmatAnom <- matrix(nrow = nens, ncol = length(varlist))
  colnames(datmatAnom) <- varlist
  
  enslist <- paste("P", formatC(0:(nens-1), width=4, flag="0"), sep="")
  floc <- paste0(ensloc,ensmember,subdir)
  
  for(i in 1:nens){
    
    vec <- rep(NA, length(varlist))
    
    ensmember <- enslist[i] 
    
    fn <- paste0(ensloc,ensmember,'/stats/','JULES-ES-1p0_',ensmember,'_Annual_global.nc')
    #print(fn)
    
    try(nc <- nc_open(paste0(fn)))
    try(vec <- sapply(varlist, FUN = tsAnomaly, nc = nc))
    datmatAnom[i, ] <- vec
    nc_close(nc)
  }
  
  save(nens, datmatAnom, enslist,floc, file ="anomaly_ensemble.rdata")
}

1.9 Clean data sets to “level 0”

Initial clean of data set, removing variables that don’t change, and removing NAs (models that didn’t run).

Y_nlevel0_ix <- which(is.na(datmat[,'year']))

YAnom_nlevel0_ix <- which(is.na(datmatAnom[,'year']))

# This should be true to proceed, or we'll have to start excluding the combined set.
identical(Y_nlevel0_ix, YAnom_nlevel0_ix)

## [1] TRUE

# Y is the whole data set
Y <- datmat
# Y_level0 is the 'cleaned' data set, truncated to variables that change, and removing NAs
Y_level0 <- datmat[-Y_nlevel0_ix, -c(2,30,31)]
Y_nlevel0 <- datmat[Y_nlevel0_ix, -c(2, 30, 31)]


# Y is the whole data set
YAnom <- datmatAnom
# Y.level0 is the 'cleaned' data set, truncated to variables that change, and removing NAs
YAnom_level0 <- datmatAnom[-YAnom_nlevel0_ix, -c(2,30,31)]
YAnom_nlevel0 <- datmatAnom[YAnom_nlevel0_ix, -c(2,30,31)]

# load the original design and input space, normalize to [0-1]

# Load up the data
lhs_i = read.table('~/brazilCSSP/code/brazil_cssp/analyze_u-ao732/data/ES_PPE_i/lhs_u-ao732.txt', header = TRUE)
lhs_ii = read.table('~/brazilCSSP/code/brazil_cssp/analyze_u-ao732/data/ES_PPE_ii/lhs_u-ao732a.txt', header = TRUE)

toplevel_ix = 1:499

# The raw input data is a latin hypercube
lhs = rbind(lhs_i, lhs_ii)[toplevel_ix, ]
lhs_level0 <- lhs[-Y_nlevel0_ix,]

X = normalize(lhs)
colnames(X) = colnames(lhs)

X_level0 <- X[-Y_nlevel0_ix,]
X_nlevel0 <- X[Y_nlevel0_ix,]

d = ncol(X)
# lower and higher bound on the normalised matrix for visualisation
rx = rbind(rep(0,32), rep(1,32))

1.10 Where did the model fail to run?

(“probability of run failure” Could be an interesting project - logit transformation for probability of run failure? Some other ML like SVM?

There are clear run failure thresholds in the parameters rootd_ft_io and lai_max, and quite strong visual indications that a_wl_io and bio_hum_cn matter.

NEXT, histograms of failure with the LHS value

#simple way
par(oma = c(10,10,0,0))
#par(xpd = TRUE)
pairs(X_nlevel0, 
      xlim = c(0,1), ylim = c(0,1),
      labels = 1:d,
      col = makeTransparent('red', 150),
      gap = 0,
      pch = 20,
      xaxt ='n', yaxt = 'n',
      lower.panel = NULL)

reset()

legend('left', legend = paste(1:d, colnames(lhs)), cex = 1.2, bty = 'n')

1.11 Testing Gaussian Process emulators

1.11.1 NPP as a test

First, use mean NPP as an example. How does NPP respond to each parameter? NAs are removed, but zero values are still included.

p <- ncol(X_level0)

y_level0 <- Y_level0[,'npp_nlim_lnd_sum']

par(mfrow = c(5,7), mar = c(3,1,3,1), oma = c(1,1,5,1))
for(i in 1:p){
  plot(X_level0[,i], y_level0, xlab = '', ylab = '', main = colnames(X_level0)[i])
}
mtext('NPP', outer = TRUE, side = 3, cex = 2, line = 2)

yanom_level0 <- YAnom_level0[,'npp_nlim_lnd_sum']

par(mfrow = c(5,7), mar = c(3,1,3,1), oma = c(1,1,5,1))
for(i in 1:p){
  plot(X_level0[,i], yanom_level0, xlab = '', ylab = '', main = colnames(X_level0)[i])
}
mtext('NPP change over time', outer = TRUE, side = 3, cex = 2, line = 2)

1.12 Relationship between modern NPP and change since 1850.

Some outputs (e.g fLuc, fHarvest) have an almost perfect 1:1 relationship between modern value and change, some (nbp, npp, treeFrac) quite or moderately strong, and some (csoil, cveg) very weak or non-existant.

par(mfrow = c(4,7), mar = c(3,1,3,1), oma = c(4,5,1,1))

pdash <- ncol(Y_level0)

for(i in 1:pdash){
  
  y_level0 <- Y_level0[,i]
  yanom_level0 <- YAnom_level0[,i]
  
  plot(y_level0, yanom_level0, xlab = '', ylab = '', main = colnames(Y_level0)[i])
}

mtext(text = 'Modern value', side = 1, line = 2, outer = TRUE, cex = 2)
mtext(text = 'Change', side = 2, line = 2, outer = TRUE, cex = 2)

1.13 A clear threshold in the F0 parameter for NPP

It appears that this ensemble is less “clear cut” in having an output that clearly distinguishes between “failed” (or close to it), and “not failed”.

Having said that, having an F0 over a threshold seems to kill the carbon cycle, as before. Here, we’ve set a threshold of 0.9 (on the normalised scale) for F0, and we remove members of the ensemble with a larger F0 than that when we build emulators.

par(mfrow = c(2,1))
plot(lhs$f0_io, datmat[, 'npp_nlim_lnd_sum'], main = 'Multiplication factor', xlab = 'f0_io', ylab = 'NPP sum')
plot(X_level0[,'f0_io'], Y_level0[, 'npp_nlim_lnd_sum'], xlab = 'f0_io', main = 'Normalized', ylab = 'NPP sum')
abline(v = 0.9)

1.14 Level 1 constraint - remove ensemble members with F0 above threshold

The level 1 constraint removes any input with F0 greater than 0.9 (normalised), which removes many of the zero-carbon-cycle members up front. There are 424 ensmble members remaining.

This does constraint sequentially, which may not be a good idea.

level1_ix <- which(X_level0[, 'f0_io'] < 0.9)

X_level1 <- X_level0[level1_ix, ]
Y_level1 <- Y_level0[level1_ix, ]

YAnom_level1 <- YAnom_level0[level1_ix, ]

y_level1 <- Y_level1[, 'npp_nlim_lnd_sum']

1.14.1 Comparison of level 0 and level 1 emulators for NPP (modern value)

em_npp_level0 <- km(~., design = X_level0,  response = y_level0)
em_npp_level1 <- km(~., design = X_level1,  response = y_level1)

em_npp_level0_ms <- km(~., design = X_level0,  response = y_level0, multistart = 4)
em_npp_level1_ms <- km(~., design = X_level1,  response = y_level1, multistart = 4)

# Plot the regular km emulator. Doesn’t look great.

plot(em_npp_level0)

plot(em_npp_level1)

1.14.2 Leave-one-out summaries of NPP emulators

loo_npp_level0 <- leaveOneOut.km(em_npp_level0, type = 'UK', trend.reestim = TRUE)
loo_npp_level1 <- leaveOneOut.km(em_npp_level1, type = 'UK', trend.reestim = TRUE)

RMSE ( loo_npp_level0$mean, y_level0)

## [1] 10.80965

RMSE ( loo_npp_level1$mean, y_level1)

## [1] 369561.8

# How about MAE over the range
prop_mae <- function(Y, Ypred){
  # mean absolute error as a proportion of the range of output
  
  absdiff <- abs(diff(range(Y)))
  
  mae <- MAE(Y, Ypred)
  
  propmae <- (mae / absdiff) * 100
  
  propmae
  
}

prop_mae(y_level0, loo_npp_level0$mean)

## [1] 8.353159

prop_mae(y_level1, loo_npp_level1$mean)

## [1] 5.221079

# MOVE THIS TO IMPTOOLS OR EMTOOLS
# Given a km model, produce some error statistics

# It doesn't look like Multistart makes a big difference at all for NPP
errstats_npp_level0 <- kmLooStats(km = em_npp_level0)
errstats_npp_level0_ms <- kmLooStats(km = em_npp_level0_ms)

errstats_npp_level1 <- kmLooStats(km = em_npp_level1)
errstats_npp_level1_ms <- kmLooStats(km = em_npp_level1_ms)


# proportional mean absolute error
errstats_npp_level0$pmae

## [1] 8.353159

errstats_npp_level0_ms$pmae

## [1] 8.353592

errstats_npp_level1$pmae

## [1] 5.221079

errstats_npp_level1_ms$pmae

## [1] 5.220805

# DiceEval is useful but slow/awkward

# DE_em_npp_level0 <- modelFit(X = X_level0, Y = y_level0, type = 'Kriging', formula = ~. )
# DE_em_npp_level1 <- modelFit(X = X_level1, Y = y_level1, type = 'Kriging', formula = ~. )

# npp_em_cv_level0 <- crossValidation(DE_em_npp_level0, 5)
# npp_em_cv_level1  <- crossValidation(DE_em_npp_level1, 5)

1.15 Constrain first and then do sensitivity analysis?

The problem with doing constraint first is that you end up with a non-hypercube shaped input space (corners have been knocked off by constraint), which is not ideal. We might therefore want different sensitivity measures for pre- and post-constrained ensemble.

#sensvar needs to go into emtools with oaat_design
sensvar <- function(oaat_pred, n, d){
  # Calculate variance as a global sensitivity meansure
  out = rep(NA,d)
  for(i in 1:d){
    ix = seq(from = ((i*n) - (n-1)), to =  (i*n), by = 1)
    out[i] = var(oaat_pred$mean[ix])
  }
  out
}

twoStep_glmnet <- function(X, y, nugget=NULL, nuggetEstim=FALSE, noiseVar=NULL, seed=NULL, trace=FALSE, maxit=100,
                          REPORT=10, factr=1e7, pgtol=0.0, parinit=NULL, popsize=100){
  # Use lasso to reduce input dimension of emulator before
  # building.
  control_list = list(trace=trace, maxit=maxit, REPORT=REPORT, factr=factr, pgtol=pgtol, pop.size=popsize)
  xvars = colnames(X)
  data = data.frame(response=y, x=X)
  colnames(data) <- c("response", xvars)
  nval = length(y)
  
  # fit a lasso by cross validation
  library(glmnet)
  fit_glmnet_cv = cv.glmnet(x=X,y=y)
  
  # The labels of the retained coefficients are here
  # (retains intercept at index zero)
  coef_i = (coef(fit_glmnet_cv, s = "lambda.1se"))@i
  labs = labels(coef(fit_glmnet_cv, s = "lambda.1se"))[[1]]
  labs = labs[-1] # remove intercept
  glmnet_retained = labs[coef_i]
  
  start_form = as.formula(paste("~ ", paste(glmnet_retained , collapse= "+")))
  m = km(start_form, design=X, response=y, nugget=nugget, parinit=parinit,
         nugget.estim=nuggetEstim, noise.var=noiseVar, control=control_list)
  
  return(list(x=X, y=y, nugget=nugget, nuggetEstim=nuggetEstim,
              noiseVar=noiseVar, emulator=m, seed=seed, coefs=m@covariance@range.val,
              trends=m@trend.coef, meanTerms=all.vars(start_form), fit_glmnet_cv=fit_glmnet_cv))
}

twoStep_sens <- function(X, y, n=21, predtype = 'UK', nugget=NULL, nuggetEstim=FALSE, noiseVar=NULL, seed=NULL, trace=FALSE, maxit=100,
                        REPORT=10, factr=1e7, pgtol=0.0, parinit=NULL, popsize=100){
  # Sensitivity analysis with twoStep emulator. 
  # Calculates the variance of the output varied one at a time across each input.
  d = ncol(X)
  X_norm <- normalize(X)
  X_oaat <- oaat_design(X_norm, n, med = TRUE)
  colnames(X_oaat) = colnames(X)
  
  twoStep_em = twoStep_glmnet(X=X, y=y, nugget=nugget, nuggetEstim=nuggetEstim, noiseVar=noiseVar,
                              seed=seed, trace=trace, maxit=maxit,
                              REPORT=REPORT, factr=factr, pgtol=pgtol,
                              parinit=parinit, popsize=popsize)
  
  oaat_pred = predict(twoStep_em$emulator, newdata = X_oaat, type = predtype)
  
  sens = sensvar(oaat_pred = oaat_pred, n=n, d=d)
  out = sens
  out
}

if (file.exists("oat_twostep.rdata")) {
  load("oat_twostep.rdata")
} else {
oat_test <- twoStep_sens(X = X_level1, y = y_level1)


save(oat_test, file = "oat_twostep.rdata")
}

y_names_sum <- c('nbp_lnd_sum', 'fLuc_lnd_sum', 'npp_nlim_lnd_sum', 'cSoil_lnd_sum',
            'cVeg_lnd_sum', 'landCoverFrac_lnd_sum', 'fHarvest_lnd_sum',
            'lai_lnd_sum', 'rh_lnd_sum', 'treeFrac_lnd_sum', 'c3PftFrac_lnd_sum', 
            'c4PftFrac_lnd_sum', 'shrubFrac_lnd_sum', 'baresoilFrac_lnd_sum')




if (file.exists("oaat_Y.rdata")) {
  load("oaat_Y.rdata")
} else {
  
  
oat_var_sensmat_Y <- matrix(NA, nrow = length(y_names_sum), ncol = ncol(X_level1))

for(i in 1:length(y_names_sum)){
  
  yname <- y_names_sum[i]
  y <- Y_level1[, yname]
  oat <- twoStep_sens(X = X_level1, y = y)
  oat_var_sensmat_Y[i, ] <- oat
}

save(y_names_sum, oat_var_sensmat_Y, file = "oaat_Y.rdata")
}

1.16 Initial sensitivity Matrix

It looks as though bwl_io is very influential across a number of variables, even though it didn’t appear that interesting in the parginal plots. Why is that?

# normalize the sensitivity matrix
colnames(oat_var_sensmat_Y) <- colnames(X_level1)
rownames(oat_var_sensmat_Y) <- y_names_sum

#test <- normalize(t(oat.var.sensmat))

#image(test)
#par()
heatmap(oat_var_sensmat_Y, Rowv = NA, Colv = NA, mar = c(10,10), scale = 'row')

heatmap(oat_var_sensmat_Y, mar = c(10,10), scale = 'row')

normsens_Y <- normalize(t(oat_var_sensmat_Y))

par(mar = c(12,12,5,2))
image(normsens_Y, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)

1.17 Re-examine bwl_io

A closer look at bwl_io now that the impact of f0_io has been removed shows a large number of “zero” NPP when bwl_io is at low values, which could well be the source of apparent sensitivity.

Take only higher values of bwl_io for the next round of constraints.

p <- ncol(X_level1)

y_level1 <- Y_level1[,'npp_nlim_lnd_sum']

par(mfrow = c(5,7), mar = c(3,1,3,1))
for(i in 1:p){
  plot(X_level1[,i], y_level1, xlab = '', ylab = '', main = colnames(X_level1)[i])
}

2 Remove ensemble members with low b_wl and see if the sensitivity analyses change and the emulators get better.

level1a_ix <- which(X_level0[, 'f0_io'] < 0.9 & X_level0[, 'b_wl_io'] > 0.15 )

X_level1a <- X_level0[level1a_ix, ]
Y_level1a <- Y_level0[level1a_ix,]

YAnom_level1a <- YAnom_level0[level1a_ix, ]

y_level1a <- Y_level1a[, 'npp_nlim_lnd_sum']

em_level1a <- km(~., design = X_level1a,  response = y_level1a)

# Plot the regular km emulator.

plot(em_level1a)

if (file.exists("oaat_level1a_Y.rdata")) {
  load("oaat_level1a_Y.rdata")
} else {
  
  
oat_var_sensmat_level1a_Y <- matrix(NA, nrow = length(y_names_sum), ncol = ncol(X_level1a))

for(i in 1:length(y_names_sum)){
  
  yname <- y_names_sum[i]
  y <- Y_level1a[, yname]
  oat <- twoStep_sens(X = X_level1a, y = y)
  oat_var_sensmat_level1a_Y[i, ] <- oat
}

save(y_names_sum, oat_var_sensmat_level1a_Y, file = "oaat_level1a_Y.rdata")
}

p <- ncol(X_level1a)

y_level1a <- Y_level1a[,'npp_nlim_lnd_sum']

par(mfrow = c(5,7), mar = c(3,1,3,1))
for(i in 1:p){
  plot(X_level1a[,i], y_level1a, xlab = '', ylab = '', main = colnames(X_level1a)[i], xlim = c(0,1))
}

2.1 With b_wl truncated, we can clearly see the sensitivities change.

# normalize the sensitivity matrix
colnames(oat_var_sensmat_level1a_Y) <- colnames(X_level1a)
rownames(oat_var_sensmat_level1a_Y) <- y_names_sum

#test <- normalize(t(oat.var.sensmat))

#image(test)
#par()
heatmap(oat_var_sensmat_level1a_Y, Rowv = NA, Colv = NA, mar = c(10,10), scale = 'row')

heatmap(oat_var_sensmat_level1a_Y, mar = c(10,10), scale = 'row')

normsens_level1a_Y <- normalize(t(oat_var_sensmat_level1a_Y))

par(mar = c(12,12,5,2))
image(normsens_level1a_Y, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1a), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance across level 1a ensemble', side = 3, adj = 0, line = 2, cex = 1.8)

# This fails for "cVeg_lnd_sum"
# The result will be that some of the columns are repeated.

if (file.exists("oaat_YAnom.rdata")) {
  load("oaat_YAnom.rdata")
} else {

oat_var_sensmat_YAnom <- matrix(NA, nrow = length(y_names_sum), ncol = ncol(X_level1))

for(i in 1:length(y_names_sum)){
  
  yname <- y_names_sum[i]
  y <- YAnom_level1[, yname]
  try(oat <- twoStep_sens(X = X_level1, y = y))
  oat_var_sensmat_YAnom[i, ] <- oat
}

save(y_names_sum, oat_var_sensmat_YAnom, file = "oaat_YAnom.rdata")

}

# normalize the sensitivity matrix
colnames(oat_var_sensmat_YAnom) <- colnames(X_level1)
rownames(oat_var_sensmat_YAnom) <- y_names_sum

#test <- normalize(t(oat.var.sensmat))

#image(test)
#par()
heatmap(oat_var_sensmat_YAnom, Rowv = NA, Colv = NA, mar = c(10,10), scale = 'row')

heatmap(oat_var_sensmat_YAnom, mar = c(10,6), scale = 'row')

normsens_YAnom <- normalize(t(oat_var_sensmat_YAnom))

par(mar = c(12,12,5,2))
image(normsens_YAnom, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance of historical change across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)

2.2 Plot both sensitivity matrices on top of one another.

It looks as though both the absolute value and the change over time are controlled by the same parameters.

par(mfrow = c(2,1), mar = c(12,12,5,2))

image(normsens_Y, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance of historical change across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)


image(normsens_YAnom, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance of historical change across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)

2.3 How good are the emulators for each output?

The emulators appear to be at least capturing the broad response for all of the output variables.

First, plot the straight kriging emulators

if (file.exists("km_emulators_Y.rdata")) {
  load("km_emulators_Y.rdata")
} else {
  
  emlist_km_Y <- vector(mode = 'list', length = length(y_names_sum))
  
  for(i in 1:length(y_names_sum)){
    yname <- y_names_sum[i]
    y <- Y_level1[, yname]
    
    em <- km(~., design = X_level1, response = y)
    emlist_km_Y[[i]] <- em
  }
  
  loolist_km_Y <- vector(mode = 'list', length = length(y_names_sum))
  
  for(i in 1:length(y_names_sum)){
    
    loo <- leaveOneOut.km(model = emlist_km_Y[[i]], type = 'UK', trend.reestim = TRUE)
    loolist_km_Y[[i]] <- loo
  }
  
  save(emlist_km_Y,loolist_km_Y, file = "km_emulators_Y.rdata")
}

loostats_km_Y <- vector(mode = 'list', length = length(y_names_sum))

for(i in 1:length(emlist_km_Y)){
  
  loostats <- kmLooStats(emlist_km_Y[[i]])
  loostats_km_Y[[i]] <- loostats
  print(loostats$pmae)
}

## [1] 5.433077
## [1] 5.142409
## [1] 5.2209
## [1] 4.193005
## [1] 4.219396
## [1] 4.531841
## [1] 5.629425
## [1] 8.565018
## [1] 5.224583
## [1] 9.223968
## [1] 7.089
## [1] 7.78196
## [1] 6.900971
## [1] 6.929568

par(mfrow = c(4,4), mar = c(3,4,2,2), oma = c(4,4,4,0.1))
for(i in 1:length(loolist_km_Y)){
  
  y <- Y_level1[, y_names_sum[i]]
  loo <- loolist_km_Y[[i]]
  ylim <- range(c(loo$mean - (2*loo$sd), loo$mean + (2*loo$sd)) )
  plot(y, loo$mean, xlab = '', ylab = '', main = y_names_sum[i] , ylim = ylim, col = makeTransparent('black', 70),
       pch = 19)
  segments(x0 = y, y0 = loo$mean - (2*loo$sd)  , x1 = y , y1 = loo$mean + (2*loo$sd), col = makeTransparent('black', 70))
  abline(0,1)
  legend('bottomright',legend = paste('mae =',round(loostats_km_Y[[i]]$pmae,2),'%') , bty = 'n')

}

mtext('Actual', side = 1, line = 1, outer = TRUE, cex = 2 )
mtext('Predicted', side = 2, line = 0, outer = TRUE, cex = 2) 
mtext('Modern Value (Y)', side = 3, line = 0, outer = TRUE, cex = 2)

if (file.exists("km_emulators_YAnom.rdata")) {
  load("km_emulators_YAnom.rdata")
} else {
  
  emlist_km_YAnom <- vector(mode = 'list', length = length(y_names_sum))
  
  for(i in 1:length(y_names_sum)){
    yname <- y_names_sum[i]
    y <- YAnom_level1[, yname]
    
    em <- km(~., design = X_level1, response = y)
    emlist_km_YAnom[[i]] <- em
  }
  
  loolist_km_YAnom <- vector(mode = 'list', length = length(y_names_sum))
  
  for(i in 1:length(y_names_sum)){
    
    loo <- leaveOneOut.km(model = emlist_km_YAnom[[i]], type = 'UK', trend.reestim = TRUE)
    loolist_km_YAnom[[i]] <- loo
  }
  
  save(emlist_km_YAnom,loolist_km_YAnom, file = "km_emulators_YAnom.rdata")
}

3 km emulators for change in variables over time

par(mfrow = c(4,4), mar = c(3,4,2,2), oma = c(4,4,4,0.1))
for(i in 1:length(loolist_km_YAnom)){
  
  y <- YAnom_level1[, y_names_sum[i]]
  loo <- loolist_km_YAnom[[i]]
  ylim <- range(c(loo$mean - (2*loo$sd), loo$mean + (2*loo$sd)) )
  plot(y, loo$mean, xlab = '', ylab = '', main = y_names_sum[i] , ylim = ylim, col = makeTransparent('black', 70),
       pch = 19)
  segments(x0 = y, y0 = loo$mean - (2*loo$sd)  , x1 = y , y1 = loo$mean + (2*loo$sd), col = makeTransparent('black', 70))
  abline(0,1)
  

}

mtext('Actual', side = 1, line = 1, outer = TRUE, cex = 2 )
mtext('Predicted', side = 2, line = 0, outer = TRUE, cex = 2) 
mtext('Change over time (YAnom)', side = 3, line = 1, outer = TRUE, cex = 2)

Next, plot the twostep glmnet/km emulators

# Twostep glmnet emulators

if (file.exists("ts_emulators_Y.rdata")) {
  load("ts_emulators_Y.rdata")
} else {
  
  emlist_twoStep_glmnet_Y <- vector(mode = 'list', length = length(y_names_sum))
  
  for(i in 1:length(y_names_sum)){
    
    yname <- y_names_sum[i]
    y <- Y_level1[, yname]
    
    em <- twoStep_glmnet(X = X_level1, y = y)
    emlist_twoStep_glmnet_Y[[i]] <- em
  }
  
  loolist_twoStep_glmnet_Y <- vector(mode = 'list', length = length(y_names_sum))
  
  for(i in 1:length(y_names_sum)){
    
    loo <- leaveOneOut.km(model = emlist_twoStep_glmnet_Y[[i]]$emulator, type = 'UK', trend.reestim = TRUE)
    loolist_twoStep_glmnet_Y[[i]] <- loo
  }
  
  
  
  save(emlist_twoStep_glmnet_Y, loolist_twoStep_glmnet_Y, file = "ts_emulators_Y.rdata")

}

# Can get numerical performance using

loostats_twoStep_glmnet_Y <- vector(mode = 'list', length = length(y_names_sum))

for(i in 1:length(emlist_twoStep_glmnet_Y)){
  
  loostats <- kmLooStats(emlist_twoStep_glmnet_Y[[i]]$emulator)
  loostats_twoStep_glmnet_Y[[i]] <- loostats
  print(loostats$pmae)
}

## [1] 5.410355
## [1] 4.733929
## [1] 5.215455
## [1] 4.173501
## [1] 4.03861
## [1] 4.597044
## [1] 5.501413
## [1] 8.563912
## [1] 5.213222
## [1] 11.36184
## [1] 7.260354
## [1] 7.707121
## [1] 6.725096
## [1] 6.865819

3.1 TwoStep emulator performance for modern values

par(mfrow = c(4,4), mar = c(3,4,2,2), oma = c(4,4,4,0.1))
for(i in 1:length(loolist_twoStep_glmnet_Y)){
  
  y <- Y_level1[, y_names_sum[i]]
  loo <- loolist_twoStep_glmnet_Y[[i]]
  ylim <- range(c(loo$mean - (2*loo$sd), loo$mean + (2*loo$sd)) )
  plot(y, loo$mean, xlab = '', ylab = '', main = y_names_sum[i] , ylim = ylim, col = makeTransparent('black', 70),
       pch = 19)
  segments(x0 = y, y0 = loo$mean - (2*loo$sd)  , x1 = y , y1 = loo$mean + (2*loo$sd), col = makeTransparent('black', 70))
  abline(0,1)
  legend('bottomright',legend = paste('mae =',round(loostats_twoStep_glmnet_Y[[i]]$pmae,2),'%') , bty = 'n')

}

mtext('Actual', side = 1, line = 1, outer = TRUE, cex = 2 )
mtext('Predicted', side = 2, line = 0, outer = TRUE, cex = 2) 
mtext('Modern Value (Y)', side = 3, line = 0, outer = TRUE, cex = 2)

4 Compare Leave-one-out statistics for the two types of emulator

We use the leave-one-out Mean Absolute Error, expressed as a percentage of the range of the output across the ensemble. We find that the twostep emulatorisn’t significantly more accurate, and is indeed less accurate for tree fraction.

km_pmae <- sapply(loostats_km_Y, '[[', 'pmae')
ts_pmae <- sapply(loostats_twoStep_glmnet_Y, '[[', 'pmae')

par(mar = c(12,4,2,1), las =1 )
plot(1:length(y_names_sum), km_pmae,
     ylim = c(0,15), pch = 19,
     axes = FALSE, xlab = '', 
     ylab = 'LOO MAE (% of range)',
     cex = 1.2)
points(1:length(y_names_sum),ts_pmae , col= 'red', pch = 19, cex = 1.2)
legend('topleft', c('km emulator', 'twoStep emulator'), pch = 19, col = c('black', 'red'), pt.cex = 1.2)
axis (2)
par(las = 2)
axis(1, at = 1:length(y_names_sum), labels = y_names_sum)

# Write an emulator wrapper

# Write a leave-one-out wrapper

# Create a list of emulator fits, for both km and twoStep methods.
# Use mclapply to build the lists
# use code from HDE package
# Make sure errors are handled adequately.


# Can we write it to use any emulator? (i.e km, twostep, something from another package?)

5 History matching and generation of new ensemble members

To use History Matching, we need to specify targets for various model outputs. We treat these targets as “observations” with an uncertainty where a model run marked as “implausible” (beyond 3sd) matches the hard boundaries previously identified by A. Wiltshire as being desirable.

Choose the centre of the (implied) uniform distribution. cs_gb.target = (3000 - 750) / 2 = 1125 cv.target = (800 - 300) / 2 = 250 npp_n_gb.target = (80 - 35) / 2 = 22.5

nbp.target = 0 (gpp.target = 75) (runoff.target = 1)

(to do: visualise implausibility of design and loo emulated values of design as histogram)

#Y.target = c(cs_gb.target, cv.target, npp_n_gb.target, runoff.target, nbp.target)

ynames_const <- c('nbp_lnd_sum', 'npp_nlim_lnd_sum', 'cSoil_lnd_sum', 'cVeg_lnd_sum')
yunits_const <- c('GtC/year', 'GtC/year', 'GtC', 'GtC')
Y_const_level1a <- Y_level1a[, ynames_const]

scalevec <- c(1e12/ysec, 1e12/ysec, 1e12, 1e12)
Y_const_level1a_scaled <- sweep(Y_const_level1a, 2, STATS = scalevec, FUN = '/' )

# This is a "normalisation vector", for making the output numbers more manageable.
#cs_gb       cv    gpp_gb        nbp npp_n_gb    runoff
norm_vec = c(1e12, 1e12, 1e12/ysec , 1e12, 1e12, 1e9)

# nbp  npp  csoil  cveg
Y_lower <- c(-10, 35, 750, 300)
Y_upper <- c(10, 80, 3000, 800)

# I'm going to set it so that + 4sd aligns approximately with the original limits
# given by Andy Wiltshire. This gives room for uncertainty from the emulator
Y_target = Y_upper - (abs(Y_upper - (Y_lower)) / 2 )# abs() to fix the problem with negative numbers


# standard deviation is derived from the limits and the central target
# (this distance is assumed to be 4 standard deviations.
Y_sd = (Y_upper - Y_target) / 4
names(Y_sd) = colnames(Y_const_level1a_scaled)


p = ncol(Y_const_level1a_scaled)

obs_sd_list = as.list(rep(0.01,p))
disc_list =  as.list(rep(0,p)) 
disc_sd_list =  as.list(Y_sd)
thres = 3

mins_aug = apply(X_level1a, 2, FUN = min)
maxes_aug =apply(X_level1a, 2, FUN = max)

# convert Y_target for ingestion into function
Y_target = matrix(Y_target, nrow = 1)

5.1 visualise the constraints

# Histogram of level 1 constraints
hcol = 'darkgrey'
lcol = 'black'
par(mfrow = c(2,2), fg = 'darkgrey', las = 1)



hist(Y_const_level1a_scaled[,'nbp_lnd_sum'], col = hcol, main = 'NBP', xlab = 'GtC/year')
polygon(x = c(-10, 100, 100, -10), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))

hist(Y_const_level1a_scaled[,'npp_nlim_lnd_sum'], col = hcol , main = 'NPP', xlab = 'GtC/year')
polygon(x = c(35, 80, 80, 35), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))

        hist(Y_const_level1a_scaled[,'cSoil_lnd_sum'], col = hcol, main = 'Soil Carbon', xlab = 'GtC')
polygon(x = c(750, 3000, 3000, 750), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))

hist(Y_const_level1a_scaled[,'cVeg_lnd_sum'], col = hcol, main = 'Vegetation Carbon', xlab = 'GtC')
polygon(x = c(300, 800, 800, 300), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
       border =  makeTransparent('tomato2'))

5.2 Timeseries that match the constraints.

Plot the original ensemble, plus the stuff that matches the constraints. Include anomalies.

# remove to level 1a Relative to toplevel_ix
toplevel_to_level_1a_ix <-(toplevel_ix[-Y_nlevel0_ix])[level1a_ix]

# So further constraining to level 2 can be associated back to the top level.

level2_ix = which(Y_const_level1a_scaled[,'nbp_lnd_sum'] > -10 &
                    Y_const_level1a_scaled[,'npp_nlim_lnd_sum'] > 35 &  Y_const_level1a_scaled[,'npp_nlim_lnd_sum'] < 80 &
                    Y_const_level1a_scaled[,'cSoil_lnd_sum'] > 750 & Y_const_level1a_scaled[,'cSoil_lnd_sum'] < 3000 &
                  Y_const_level1a_scaled[,'cVeg_lnd_sum'] > 300 & Y_const_level1a_scaled[,'cVeg_lnd_sum'] < 800
  )

We find that adding a constraint to level 2 constrains the behaviour of the soil carbon pool, but not the vegetation carbon pool. Probably not a surprise, given the Soil Carbon pool (and changes) are much larger than the vegetation carbon pool.

#cSoil_ens_anom [toplevel_to_level_1a_ix, ]

par(mfrow= c(1,4), las = 1)

plotcol <- c(makeTransparent('black', 70))
overplotcol <- c(makeTransparent('tomato3', 200))


ylim = range(cSoil_ens)
matplot(years, t(cSoil_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'cSoil',
        bty = 'n')
matlines( years, t((cSoil_ens[toplevel_to_level_1a_ix, ])[level2_ix, ]),col = overplotcol,  lwd = 1.5, lty = 'solid')

ylim = range(cSoil_ens)
matplot(years, t(cVeg_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'cVeg',
        bty = 'n')
matlines( years, t((cVeg_ens[toplevel_to_level_1a_ix, ])[level2_ix, ]), col = overplotcol, lwd = 1.5, lty = 'solid')



ylim = range(cSoil_ens_anom)
matplot(years, t(cSoil_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'cSoil Anomaly',
        bty = 'n')
matlines( years, t((cSoil_ens_anom[toplevel_to_level_1a_ix, ])[level2_ix, ]),col = overplotcol, lwd = 1.5, lty = 'solid')


ylim = range(cSoil_ens_anom)
matplot(years, t(cVeg_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'cVeg Anomaly',
        bty = 'n')
matlines( years, t((cVeg_ens_anom[toplevel_to_level_1a_ix, ])[level2_ix, ]),col = overplotcol,lwd = 1.5, lty = 'solid')

5.3 Total land carbon change

We plot the JULES total land carbon change from 1850, together with “observations” from the Global Carbon Budget (Friedligstein et al, 2020). The “observations are the cumulative sum of the ‘land sink’ column on the”historical observations tab of the data from the paper. It states “The land sink is the average of several dynamic global vegetation models that reproduce the observed mean total land sink of the 1990s.”

par(mfrow= c(1,2), las = 1)

ylim = range(total_land_carbon_ens)
matplot(years, t(total_land_carbon_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'Total land carbon',
        bty = 'n')
matlines( years, t((total_land_carbon_ens[toplevel_to_level_1a_ix, ])[level2_ix, ]), col = overplotcol, lty = 'solid', lwd = 1.5)

ylim = range(total_land_carbon_anom)
matplot(years, t(total_land_carbon_anom), type = 'l', lty = 'solid',ylim = ylim, col = plotcol,
        ylab = 'GtC', main = 'Total land carbon anomaly',
        bty = 'n')
matlines( years, t((total_land_carbon_anom[toplevel_to_level_1a_ix, ])[level2_ix, ]), col = overplotcol, lty = 'solid', lwd = 1.5)

#Anomalize historical observations to 1850 - 1870
historical_total_land_carbon_obs <- cumsum(historical_carbon_budget$`land sink`)
historical_total_land_carbon_obs_anom <- historical_total_land_carbon_obs - historical_total_land_carbon_obs[which(historical_carbon_budget$Year %in% 1850:1870)]

lines(historical_carbon_budget$Year,historical_total_land_carbon_obs_anom, type = 'l',col = 'blue', lwd = 2)
legend('topleft', legend = c('Ruled out', 'Level1a constraint', 'historical'), col = c('black', overplotcol, 'blue'),
       lwd = c(1,1.5,2), lty = 'solid')

5.3.1 Land carbon Sink?

I think to make a fair comparison, we might need to take the diff of the anomaly (or just the timeseries), as the above represents the total change since 1850, and I think all the “land sink” stuff looks at the year-to-year changes.

landsink_ens <- matrix(NA, nrow = nrow(total_land_carbon_ens), ncol = ncol(total_land_carbon_ens)-1)

for(i in 1:nrow(landsink_ens)){
  
  landsink_ens[i, ] <- diff(total_land_carbon_ens[i, ])
}


ylim = range(landsink_ens)
matplot(years[-1], t(landsink_ens), type = 'l', lty = 'solid',ylim = ylim, col = plotcol, xlab = '',
        ylab = 'GtC/year', main = 'Total land carbon sink',
        bty = 'n')
matlines( years[-1], t((landsink_ens[toplevel_to_level_1a_ix, ])[level2_ix, ]), col = 'tomato3', lty = 'solid')

lines(historical_carbon_budget$Year, historical_carbon_budget$`land sink`, col = 'blue', lwd = 2, xlab = '')
legend('bottomleft', legend = c('Ruled out','level 1a constrained','historical'), col = c('black','tomato3', 'blue' ), lty ='solid',
       lwd = c(1,1,2))

5.4 Augment the design.

The function addNroyDesignPoints builds an emulator for each model output in Y. It compares the output of each emulator at a number of candidate desin points, and chooses a space-filling set of them that that are Not Ruled Out Yet (statistically close to the observation at Y_target).

# Final output needs to be expressed in terms of original LHS, then put back out to conf files.

# This function adds n.aug potential design points, and finds their implausibility
# score in X.nroy

wave1 = addNroyDesignPoints(X = X_level1a, 
                            Y = Y_const_level1a_scaled, 
                            Y_target = Y_target,
                            n_aug = 50000, 
                            mins_aug = mins_aug,
                            maxes_aug = maxes_aug,
                            thres = 3,
                            disc_list=disc_list,
                            disc_sd_list = disc_sd_list,
                            obs_sd_list = obs_sd_list,
                            n_app = 500,
                            nreps = 500)

5.5 Write the augmented design

The function write_jules_design here simply takes the points calculated by addNroyDesignPoints and writes them to configuration files.

# this needs sorting
source('~/myRpackages/julesR/vignettes/default_jules_parameter_perturbations.R')

# Easiest way to generate a design of the right size is to have a "fac" which takes
# the names from the parameter list, and then multiplies everything by 0.5 or 2

tf <- 'l_vg_soil'
# we don't want anything that is TRUE/FALSE to be in fac
fac_init <- names(paramlist)
not_tf_ix <- which(names(paramlist)!=tf)
paramlist_trunc <-paramlist[not_tf_ix]

fac <- names(paramlist_trunc)

maxfac <-lapply(paramlist_trunc,function(x) x$max[which.max(x$max)] / x$standard[which.max(x$max)])
minfac <- lapply(paramlist_trunc,function(x) x$min[which.max(x$max)] / x$standard[which.max(x$max)])

# create a directory for the configuration files
confdir <- 'conf_files_augment_JULES-ES-1p0'

dir.create(confdir)

## Warning in dir.create(confdir): 'conf_files_augment_JULES-ES-1p0' already exists

X_mm <- wave1$X_mm

# This is the function that writes the configuration files.
write_jules_design(X_mm = X_mm, paramlist=paramlist, n = nrow(X_mm),
                    fac = fac, minfac = minfac, maxfac = maxfac,
                    tf = tf,
                    fnprefix = paste0(confdir,'/param-perturb-P'),
                    lhsfn = paste0(confdir,'/lhs_example.txt'),
                    stanfn = paste0(confdir,'/stanparms_example.txt'),
                    allstanfn = paste0(confdir,'/allstanparms_example.txt'),
                    rn = 12,
                    startnum = 500)

5.6 Check the design

Check that the augmented design produces what we expect. New ensemble members should be somewhat constrained within the boundaries of the original design, if the comparison to data offers any constraint.

X_mm <- wave1$X_mm

pairs(rbind(X, X_mm), xlim = c(0,1), ylim = c(0,1), gap = 0, lower.panel = NULL, 
      col = c(rep('grey', nrow(X)), rep('red', nrow(X_mm))),
      pch = c(rep(21, nrow(X)), rep(20, nrow(X_mm)))
      )

par(xpd = NA)

legend('bottom',
       legend = c('Original design', 'New points'),
       col = c('grey', 'red'),
       inset = 0.15,
       cex = 1.5,
       pch = c(21,20)
)

Visualise the predicted outputs at the NROY points of the old design, and the new suggested design.

Y_mm_list <- vector(mode ='list', length = length(wave1$fit_list))

for(i in 1:length(wave1$fit_list)){

y_mm <- predict(object=wave1$fit_list[[i]], newdata = wave1$X_mm, type = 'UK')

Y_mm_list[[i]] <- y_mm

}

5.7 Visualising the emulated outputs at the proposed new design points.

Interestingly, these aren’t perfectly within the original hard boundaries set by Andy, even though I’ve set those boundaries to be the +- 4 standard deviation threholds in the History Match. I suggest this is because there is model discrepancy, and that there is considerable wriggle room induced from emulator uncertainty.

In particular, it appears that vegetation carbon is difficult to keep high, and that many NROY proposed members have a fairly low vegetation carbon. This might need a discrepancy term, or adjusting in some other way. It certainly needs exploring, and a OAAT plot might give clues as to the parameters to choose.

# Histogram of level 1 constraints
hcol = 'darkgrey'
lcol = 'black'
par(mfrow = c(2,2), fg = 'darkgrey', las = 1)

discsd <- c(disc_sd_list, recursive = TRUE)

hist(Y_const_level1a_scaled[,'nbp_lnd_sum'], col = hcol, main = 'NBP', xlab = 'GtC/year')
hist(Y_mm_list[[1]]$mean, add = TRUE, col = 'black')
polygon(x = c(-10, 100, 100, -10), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))
rug(Y_target[1], col = 'black')
rug(c(Y_target[1] + 3*discsd[1],Y_target[1] - 3*discsd[1]) , col = 'red')

## Warning in rug(c(Y_target[1] + 3 * discsd[1], Y_target[1] - 3 * discsd[1]), :
## some values will be clipped

hist(Y_const_level1a_scaled[,'npp_nlim_lnd_sum'], col = hcol , main = 'NPP', xlab = 'GtC/year')
hist(Y_mm_list[[2]]$mean, add = TRUE, col = 'black')
polygon(x = c(35, 80, 80, 35), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))
rug(Y_target[2], col = 'black')
rug(c(Y_target[2] + 3*discsd[2],Y_target[2] - 3*discsd[2]) , col = 'red')

hist(Y_const_level1a_scaled[,'cSoil_lnd_sum'], col = hcol, main = 'Soil Carbon', xlab = 'GtC')
hist(Y_mm_list[[3]]$mean, add = TRUE, col = 'black')
polygon(x = c(750, 3000, 3000, 750), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
        border = makeTransparent('tomato2'))
rug(Y_target[3], col = 'black')
rug(c(Y_target[3] + 3*discsd[3],Y_target[3] - 3*discsd[3]) , col = 'red')

hist(Y_const_level1a_scaled[,'cVeg_lnd_sum'], col = hcol, main = 'Vegetation Carbon', xlab = 'GtC')
hist(Y_mm_list[[4]]$mean, add = TRUE, col = 'black')
polygon(x = c(300, 800, 800, 300), y = c(0, 0, 1000, 1000),
        col = makeTransparent('tomato2'),
       border =  makeTransparent('tomato2'))
rug(Y_target[4], col = 'black')
rug(c(Y_target[4] + 3*discsd[4],Y_target[4] - 3*discsd[4]) , col = 'red')

How good are the four emulators that we’ve built? Are there biases? (there’s no real evidence of this)

par(mfrow = c(2,2))

for(i in 1:4){
  
hist(wave1$pred_list[[i]]$mean, main = colnames(Y_const_level1a_scaled)[i])
  
}

5.8 One-at-a-time sensitivity analysis for understanding model response

It’s obviously hard to maintain a high vegetation carbon in particular. What parameter values might you choose to do this, and what might be the trade-offs you have to make?

X_oaat_level1a <- oaat_design(X_level1a, n=21, med = TRUE)
colnames(X_oaat_level1a) = colnames(X)

y_oaat <- predict.km(wave1$fit_list[[4]], newdata = X_oaat_level1a, type = 'UK')

First, what parameters affect vegetation carbon and how? How sure are we about that?

oaatLinePlot(X_oaat = X_oaat_level1a, y_oaat_mean = y_oaat$mean, y_oaat_sd = y_oaat$sd, 
             n_oaat = 21,nr = 6, nc = 6)

Y_oaat_const_level1a_scaled <- matrix(ncol = ncol(Y_const_level1a_scaled), nrow = nrow(X_oaat_level1a))

for(i in 1:ncol(Y_const_level1a_scaled)){

  y_oaat <- predict.km(wave1$fit_list[[i]], newdata = X_oaat_level1a, type = 'UK')
  Y_oaat_const_level1a_scaled[,i] <- y_oaat$mean
}

What might be the trade-offs for a high (or accurate) vegetation carbon? are they acceptable? Plot the oaat sensitivity of the other 3 outputs we’re calibrating on.

Y_oaat_const_level1a_scaled_norm <- normalize(Y_oaat_const_level1a_scaled)

oaatLinePlotMulti(X_oaat = X_oaat_level1a, Y_oaat = Y_oaat_const_level1a_scaled_norm ,  n_oaat = 21, nr = 6, nc = 6,
                  lwd = 2)

reset()
legend('top', c('nbp', 'npp', 'csoil', 'cveg'), col = c(1,2,3,4), lty = 'solid', lwd = 2, horiz = TRUE)

What do the emulators make of the design points which actually make Andy’s “hard boundary” criteria? If we leave them out, do they still place the output within the hard boundaries?

aw_boundary_ix = which(Y_const_level1a_scaled[,'nbp_lnd_sum'] > -10 &
                    Y_const_level1a_scaled[,'npp_nlim_lnd_sum'] > 35 &  Y_const_level1a_scaled[,'npp_nlim_lnd_sum'] < 80 &
                    Y_const_level1a_scaled[,'cSoil_lnd_sum'] > 750 & Y_const_level1a_scaled[,'cSoil_lnd_sum'] < 3000 &
                  Y_const_level1a_scaled[,'cVeg_lnd_sum'] > 300 & Y_const_level1a_scaled[,'cVeg_lnd_sum'] < 800
  )



X_aw_boundary = X_level1a[aw_boundary_ix, ]
Y_aw_boundary = Y_const_level1a_scaled[aw_boundary_ix, ]

5.9 Check the emulators that produce the new design

Do a leave-one-out cross validation of points inside the hard boundaries using the wave1 fits.

# quickest to find the loo predictions at the right indices

loo_mean_Y_level1a <- matrix(nrow = nrow(X_level1a), ncol = ncol(Y_const_level1a_scaled))
loo_sd_Y_level1a <- matrix(nrow = nrow(X_level1a), ncol = ncol(Y_const_level1a_scaled))
  
  for(i in 1:ncol(Y_const_level1a_scaled)){

loo <- leaveOneOut.km(wave1$fit_list[[i]],type = 'UK', trend.reestim = TRUE )
  loo_mean_Y_level1a[,i] <- loo$mean
  loo_sd_Y_level1a[,i] <- loo$sd
}

We see in the leave-one-out analysis that the emulator is consistently under-predicting the vegetation carbon (though the uncertainty estimate often covers the actual value).This suggests (1) that there isn’t really a huge problem with a model discrepancy (or at least that isn’t the only problem), and (2) the history matching is working as it should, and taking into account a not-great emulator.

par(mfrow = c(2,2))

for(i in 1:ncol(loo_mean_Y_level1a)){
  
  rn <- range(c(loo_mean_Y_level1a[aw_boundary_ix,i] - (2*loo_sd_Y_level1a[aw_boundary_ix,i]) , loo_mean_Y_level1a[aw_boundary_ix,i] + (2*loo_sd_Y_level1a[aw_boundary_ix,i]) ))
  
  
  
  plot(Y_const_level1a_scaled[aw_boundary_ix, i], loo_mean_Y_level1a[aw_boundary_ix,i], ylim = rn, main = colnames(Y_const_level1a_scaled)[i], xlab = 'actual', ylab = 'predicted')
  
  segments(x0 = Y_const_level1a_scaled[aw_boundary_ix, i], y0 = loo_mean_Y_level1a[aw_boundary_ix,i] - (2*loo_sd_Y_level1a[aw_boundary_ix,i])  , x1 = Y_const_level1a_scaled[aw_boundary_ix, i] , y1 = loo_mean_Y_level1a[aw_boundary_ix,i] + (2*loo_sd_Y_level1a[aw_boundary_ix,i]) , col = makeTransparent('black', 70))
  abline(0,1)
  
}

5.10 Compare the straight km with a two-step emulator

Is a two-step emulator any better at emulating those crucial points which fall within aw’s hard boundaries? First, create a list of emulator fits.

fitlist_Y_const_level1a_scaled <- vector(mode = 'list', length = ncol(Y_const_level1a_scaled))

for(i in 1:ncol(Y_const_level1a_scaled)){
  
  y <- Y_const_level1a_scaled[,i]
  fit <- twoStep_glmnet(X = X_level1a, y)
  fitlist_Y_const_level1a_scaled[[i]] <- fit

}

loo_mean_glmnet_Y_level1a <- matrix(nrow = nrow(X_level1a), ncol = ncol(Y_const_level1a_scaled))
loo_sd_glmnet_Y_level1a <- matrix(nrow = nrow(X_level1a), ncol = ncol(Y_const_level1a_scaled))
  
  for(i in 1:ncol(Y_const_level1a_scaled)){

loo <- leaveOneOut.km(fitlist_Y_const_level1a_scaled[[i]]$emulator,type = 'UK', trend.reestim = TRUE )
  loo_mean_glmnet_Y_level1a[,i] <- loo$mean
  loo_sd_glmnet_Y_level1a[,i] <- loo$sd
}

It doesn’t appear that the two-step emulator (here plotted in red) is doing any better than the regular emulator.

par(mfrow = c(2,2))

for(i in 1:ncol(loo_mean_Y_level1a)){
  
  rn <- range(c(loo_mean_Y_level1a[aw_boundary_ix,i] - (2*loo_sd_Y_level1a[aw_boundary_ix,i]) , loo_mean_Y_level1a[aw_boundary_ix,i] + (2*loo_sd_Y_level1a[aw_boundary_ix,i]) ))
  
  
  
  plot(Y_const_level1a_scaled[aw_boundary_ix, i], loo_mean_Y_level1a[aw_boundary_ix,i], ylim = rn, main = colnames(Y_const_level1a_scaled)[i], xlab = 'actual', ylab = 'predicted')
  
  points(Y_const_level1a_scaled[aw_boundary_ix, i], loo_mean_glmnet_Y_level1a[aw_boundary_ix,i], col = 'red')
  
  
  segments(x0 = Y_const_level1a_scaled[aw_boundary_ix, i], y0 = loo_mean_Y_level1a[aw_boundary_ix,i] - (2*loo_sd_Y_level1a[aw_boundary_ix,i])  , x1 = Y_const_level1a_scaled[aw_boundary_ix, i] , y1 = loo_mean_Y_level1a[aw_boundary_ix,i] + (2*loo_sd_Y_level1a[aw_boundary_ix,i]) , col = makeTransparent('black', 70))
  abline(0,1)
  
}

Two other things I can think of to check: 1) how about using “multistart” to choose different starting conditions for optimising the emulators and 2) Using a flat prior for the mean function rather than a linear prior.

fitlist_flatprior_Y_const_level1a_scaled <- vector(mode = 'list', length = ncol(Y_const_level1a_scaled))

for(i in 1:ncol(Y_const_level1a_scaled)){
  
  y <- Y_const_level1a_scaled[,i]
  fit <- km(formula =~1, design = X_level1a, response = y)
  fitlist_flatprior_Y_const_level1a_scaled[[i]] <- fit

}

## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~1
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  2 2 1.999244 1.698141 1.994798 1.990622 1.998154 1.778477 2 2 2 1.998084 2 2 2 1.99598 2 1.989889 1.997502 1.989483 1.98997 1.987916 1.983347 1.988424 2 1.990947 1.995643 1.994232 1.992247 2 2 2 
##   - best initial criterion value(s) :  -327.0921 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=       327.09  |proj g|=       1.9295
## At iterate     1  f =       192.73  |proj g|=        1.8758
## At iterate     2  f =       154.28  |proj g|=        1.8547
## At iterate     3  f =       141.83  |proj g|=        1.8136
## At iterate     4  f =       137.35  |proj g|=         1.701
## At iterate     5  f =       135.59  |proj g|=        1.3782
## At iterate     6  f =       134.73  |proj g|=        1.2097
## At iterate     7  f =       134.63  |proj g|=       0.90336
## At iterate     8  f =       134.55  |proj g|=       0.85223
## At iterate     9  f =       134.49  |proj g|=       0.67963
## At iterate    10  f =       134.46  |proj g|=       0.68007
## At iterate    11  f =       134.45  |proj g|=        0.4125
## At iterate    12  f =       134.43  |proj g|=       0.26134
## At iterate    13  f =       134.42  |proj g|=       0.12086
## At iterate    14  f =       134.42  |proj g|=       0.02021
## At iterate    15  f =       134.42  |proj g|=      0.011412
## At iterate    16  f =       134.42  |proj g|=     0.0038673
## At iterate    17  f =       134.42  |proj g|=     0.0089302
## At iterate    18  f =       134.42  |proj g|=     0.0019943
## At iterate    19  f =       134.42  |proj g|=      0.001502
## At iterate    20  f =       134.42  |proj g|=    0.00054782
## At iterate    21  f =       134.42  |proj g|=    0.00024589
## 
## iterations 21
## function evaluations 24
## segments explored during Cauchy searches 50
## BFGS updates skipped 0
## active bounds at final generalized Cauchy point 19
## norm of the final projected gradient 0.000245887
## final function value 134.421
## 
## F = 134.421
## final  value 134.420708 
## converged
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~1
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  2 2 1.999244 1.698141 1.994798 1.990622 1.998154 1.778477 2 2 2 1.998084 2 2 2 1.99598 2 1.989889 1.997502 1.989483 1.98997 1.987916 1.983347 1.988424 2 1.990947 1.995643 1.994232 1.992247 2 2 2 
##   - best initial criterion value(s) :  -1616.775 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=       1616.8  |proj g|=       1.7821
## At iterate     1  f =       1515.1  |proj g|=        1.4208
## At iterate     2  f =       1483.9  |proj g|=        1.4559
## At iterate     3  f =       1448.4  |proj g|=        1.5571
## At iterate     4  f =       1429.8  |proj g|=        1.4105
## At iterate     5  f =       1422.5  |proj g|=        1.3902
## At iterate     6  f =       1416.3  |proj g|=        1.1117
## At iterate     7  f =       1414.9  |proj g|=        1.2025
## At iterate     8  f =       1414.6  |proj g|=       0.37888
## At iterate     9  f =       1414.6  |proj g|=       0.30658
## At iterate    10  f =       1414.5  |proj g|=        0.3672
## At iterate    11  f =       1414.5  |proj g|=       0.22975
## At iterate    12  f =       1414.5  |proj g|=       0.11415
## At iterate    13  f =       1414.5  |proj g|=      0.084796
## At iterate    14  f =       1414.5  |proj g|=      0.043582
## At iterate    15  f =       1414.5  |proj g|=      0.015055
## At iterate    16  f =       1414.5  |proj g|=      0.005867
## At iterate    17  f =       1414.5  |proj g|=      0.005738
## At iterate    18  f =       1414.5  |proj g|=     0.0039265
## 
## iterations 18
## function evaluations 22
## segments explored during Cauchy searches 68
## BFGS updates skipped 0
## active bounds at final generalized Cauchy point 25
## norm of the final projected gradient 0.00392653
## final function value 1414.5
## 
## F = 1414.5
## final  value 1414.500017 
## converged
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~1
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  2 2 1.999244 1.698141 1.994798 1.990622 1.998154 1.778477 2 2 2 1.998084 2 2 2 1.99598 2 1.989889 1.997502 1.989483 1.98997 1.987916 1.983347 1.988424 2 1.990947 1.995643 1.994232 1.992247 2 2 2 
##   - best initial criterion value(s) :  -2754.042 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=         2754  |proj g|=       1.8846
## At iterate     1  f =         2613  |proj g|=             2
## At iterate     2  f =       2583.6  |proj g|=        1.7266
## At iterate     3  f =       2576.5  |proj g|=        1.6088
## At iterate     4  f =       2572.5  |proj g|=        1.9528
## At iterate     5  f =         2572  |proj g|=        1.9197
## At iterate     6  f =       2571.7  |proj g|=        1.8864
## At iterate     7  f =       2571.5  |proj g|=        1.8206
## At iterate     8  f =       2571.2  |proj g|=        1.4812
## At iterate     9  f =       2571.1  |proj g|=       0.95035
## At iterate    10  f =         2571  |proj g|=       0.74997
## At iterate    11  f =       2570.9  |proj g|=       0.25526
## At iterate    12  f =       2570.9  |proj g|=       0.19718
## At iterate    13  f =       2570.9  |proj g|=      0.040803
## At iterate    14  f =       2570.9  |proj g|=     0.0080466
## At iterate    15  f =       2570.9  |proj g|=     0.0028131
## 
## iterations 15
## function evaluations 23
## segments explored during Cauchy searches 56
## BFGS updates skipped 0
## active bounds at final generalized Cauchy point 27
## norm of the final projected gradient 0.00281306
## final function value 2570.93
## 
## F = 2570.93
## final  value 2570.925355 
## converged
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~1
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  2 2 1.999244 1.698141 1.994798 1.990622 1.998154 1.778477 2 2 2 1.998084 2 2 2 1.99598 2 1.989889 1.997502 1.989483 1.98997 1.987916 1.983347 1.988424 2 1.990947 1.995643 1.994232 1.992247 2 2 2 
##   - best initial criterion value(s) :  -2467.253 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=       2467.3  |proj g|=       1.9153
## At iterate     1  f =       2418.6  |proj g|=        1.8865
## At iterate     2  f =       2387.4  |proj g|=        1.7581
## At iterate     3  f =       2364.8  |proj g|=        1.9739
## At iterate     4  f =       2356.4  |proj g|=         1.968
## At iterate     5  f =       2350.4  |proj g|=        1.9546
## At iterate     6  f =       2342.8  |proj g|=        1.9142
## At iterate     7  f =       2331.2  |proj g|=        1.7884
## At iterate     8  f =       2324.6  |proj g|=        1.6779
## At iterate     9  f =       2319.7  |proj g|=          1.54
## At iterate    10  f =       2317.7  |proj g|=        1.4539
## At iterate    11  f =       2316.9  |proj g|=        1.3771
## At iterate    12  f =       2316.2  |proj g|=        1.2376
## At iterate    13  f =       2315.7  |proj g|=        1.2514
## At iterate    14  f =       2315.5  |proj g|=       0.78601
## At iterate    15  f =       2315.5  |proj g|=       0.46657
## At iterate    16  f =       2315.4  |proj g|=       0.53647
## At iterate    17  f =       2315.4  |proj g|=        0.4986
## At iterate    18  f =       2315.4  |proj g|=       0.15138
## At iterate    19  f =       2315.4  |proj g|=      0.076824
## At iterate    20  f =       2315.4  |proj g|=      0.067048
## At iterate    21  f =       2315.4  |proj g|=       0.04419
## At iterate    22  f =       2315.4  |proj g|=      0.042393
## At iterate    23  f =       2315.4  |proj g|=      0.025063
## At iterate    24  f =       2315.4  |proj g|=      0.035315
## At iterate    25  f =       2315.4  |proj g|=      0.011844
## At iterate    26  f =       2315.4  |proj g|=      0.012962
## At iterate    27  f =       2315.4  |proj g|=      0.014111
## At iterate    28  f =       2315.4  |proj g|=     0.0023209
## 
## iterations 28
## function evaluations 32
## segments explored during Cauchy searches 50
## BFGS updates skipped 0
## active bounds at final generalized Cauchy point 21
## norm of the final projected gradient 0.00232088
## final function value 2315.38
## 
## F = 2315.38
## final  value 2315.375071 
## converged

loo_mean_flatprior_Y_level1a <- matrix(nrow = nrow(X_level1a), ncol = ncol(Y_const_level1a_scaled))
loo_sd_flatprior_Y_level1a <- matrix(nrow = nrow(X_level1a), ncol = ncol(Y_const_level1a_scaled))
  
  for(i in 1:ncol(Y_const_level1a_scaled)){

loo <- leaveOneOut.km(fitlist_flatprior_Y_const_level1a_scaled[[i]],type = 'UK', trend.reestim = TRUE )
  loo_mean_flatprior_Y_level1a[,i] <- loo$mean
  loo_sd_flatprior_Y_level1a[,i] <- loo$sd
}

par(mfrow = c(2,2))

for(i in 1:ncol(loo_mean_Y_level1a)){
  
  rn <- range(c(loo_mean_Y_level1a[aw_boundary_ix,i] - (2*loo_sd_Y_level1a[aw_boundary_ix,i]) , loo_mean_Y_level1a[aw_boundary_ix,i] + (2*loo_sd_Y_level1a[aw_boundary_ix,i]) ))
  
  
  
  plot(Y_const_level1a_scaled[aw_boundary_ix, i], loo_mean_Y_level1a[aw_boundary_ix,i], ylim = rn, main = colnames(Y_const_level1a_scaled)[i], xlab = 'actual', ylab = 'predicted')
  
  points(Y_const_level1a_scaled[aw_boundary_ix, i], loo_mean_flatprior_Y_level1a[aw_boundary_ix,i], col = 'red')
  
  
  segments(x0 = Y_const_level1a_scaled[aw_boundary_ix, i], y0 = loo_mean_Y_level1a[aw_boundary_ix,i] - (2*loo_sd_Y_level1a[aw_boundary_ix,i])  , x1 = Y_const_level1a_scaled[aw_boundary_ix, i] , y1 = loo_mean_flatprior_Y_level1a[aw_boundary_ix,i] + (2*loo_sd_flatprior_Y_level1a[aw_boundary_ix,i]) , col = makeTransparent('black', 70))
  abline(0,1)
  
}

6 Exploring new constraints

Input from Eddy Robertson:

“We can assume that the majority of vegetation carbon is stored in tree trunks so, the carbon density of trees is approximately (cVeg_lnd_mean / treeFrac_lnd_mean). Although I suppose it’s possible that in the PPE the shrubs have become tree-like, so it might be interesting to plot cVeg_lnd_mean versus (shrubFrac_lnd_mean + treeFrac_lnd_mean) as well. The first question is whether cVeg_lnd_mean increases linearly with treeFrac_lnd_mean. Then if this turned out to be interesting, we could produce a treeCVeg_lnd_mean output.”

#pdf(file = 'vegC_density.pdf', width = 6, height = 6)
plot(Y_level0[,'treeFrac_lnd_mean'],Y_level0[,'cVeg_lnd_mean'], main = "Vegetation (tree) carbon density, Level0",
     xlab = 'treeFrac_lnd_mean', ylab  = 'cVeg_lnd_mean' )

#dev.off()

Is the range of carbon densities narrower in the “hard boundary” set?

Y_level1a_aw <- Y_level1a[aw_boundary_ix, ]
X_level1a_aw <- X_level1a[aw_boundary_ix, ]

#pdf(file = 'vegC_density_aw.pdf', width = 6, height = 6)
plot(Y_level1a[,'treeFrac_lnd_mean'],Y_level1a[,'cVeg_lnd_mean'], main = "Vegetation (tree) carbon density",
     xlab = 'treeFrac_lnd_mean', ylab  = 'cVeg_lnd_mean' )

points( Y_level1a_aw[,'treeFrac_lnd_mean'],Y_level1a_aw[,'cVeg_lnd_mean'], pch = 19, col = 'red')
legend('topleft', legend = c('Level1a','AW constraints'), pch = c(21, 19), col = c('black', 'red'))

#   dev.off()

c_density <- Y_level1a[,'treeFrac_lnd_mean'] / Y_level1a[,'cVeg_lnd_mean']

hist(c_density)

6.1 Now that we have constraints, we can explore their effect on the output

# Pairs plot of output
y_names_mean <- c('nbp_lnd_mean', 'fLuc_lnd_mean', 'npp_nlim_lnd_mean', 'cSoil_lnd_mean',
            'cVeg_lnd_mean', 'landCoverFrac_lnd_mean', 'fHarvest_lnd_mean',
            'lai_lnd_mean', 'rh_lnd_mean', 'treeFrac_lnd_mean', 'c3PftFrac_lnd_mean', 
            'c4PftFrac_lnd_mean', 'shrubFrac_lnd_mean', 'baresoilFrac_lnd_mean')

Y_level1a_mean <- Y_level1a[,y_names_mean]

pcol <- rep('black', nrow(Y_level1a_mean))
pcol[aw_boundary_ix] <- 'red'
pairs(Y_level1a_mean, col = pcol, lower.panel = NULL, pch = 19, cex.labels = 1.5)

6.2 How much does constraining an output constrain other outputs?

For example, what effect has the AW “hard boundary” constraint produced on ranges of the other (marginal) outputs? Which constraint is doing the majority of the work?

A function to find the proportion of output space that is removed (or retained) when applying a constraint.

constraint_prop <- function(Y, constraint_ix){
  
  Yconst <- Y[constraint_ix, ]
  Yconst_norm <- normalize(Yconst, wrt = Y)
  rn <- apply(Yconst_norm, 2, range)
  out <- rn
  out
  
}

Y_constraint_plot <- function(constraint_range, constraint_columns, ...){
  # A function to plot the output of constraint_prop
  
  par(mar = c(12,4,3,2),las = 2)

pcol = rep('black', ncol(constraint_range))
pcol[constraint_columns] <- 'red' # plot the outputs where we have specified the constraints in red.

plot(1:ncol(constraint_range),constraint_range[2,], ylim = c(0,1) , axes = FALSE, xlab = '', ylab = 'range of ensemble (fraction)', col = pcol, pch = 19, ...)
points(1:ncol(constraint_range), constraint_range[1,], col = pcol, pch = 19)
segments(x0 = 1:ncol(constraint_range), y0 = constraint_range[1,], x1 = 1:ncol(constraint_range), y1 = constraint_range[2,], col = pcol, lwd =2 )

axis(side = 1, labels = colnames(constraint_range), at = 1:ncol(constraint_range))
axis(2)
abline(h = c(0,1), col = 'red')
  
}

All constraints together take the normalized range of all the outputs to a little under a half of the original range, on average. Of course, four of the inputs are constrained here, which has quite a large impact.

Y_level1a_mean_aw_norm_rn <- constraint_prop(Y = Y_level1a_mean, constraint_ix = aw_boundary_ix)
mean_constraint_all <- mean(Y_level1a_mean_aw_norm_rn[2, ] - Y_level1a_mean_aw_norm_rn[1, ])
Y_constraint_plot(Y_level1a_mean_aw_norm_rn,constraint_columns = c(c(1,3,4,5)), main = paste('mean constrained range =', round(mean_constraint_all,3)))

We can see what impacts the constraints have individually

nbp_ix <- which(Y_const_level1a_scaled[,'nbp_lnd_sum'] > -10)
npp_ix <- which(Y_const_level1a_scaled[,'npp_nlim_lnd_sum'] > 35 &  Y_const_level1a_scaled[,'npp_nlim_lnd_sum'] < 80 )
cSoil_ix <- which(Y_const_level1a_scaled[,'cSoil_lnd_sum'] > 750 & Y_const_level1a_scaled[,'cSoil_lnd_sum'] < 3000)
cVeg_ix <- which(Y_const_level1a_scaled[,'cVeg_lnd_sum'] > 300 & Y_const_level1a_scaled[,'cVeg_lnd_sum'] < 800)

The NBP constraint has no marginal impact

Y_level1a_mean_nbp_const <- constraint_prop(Y = Y_level1a_mean, constraint_ix = nbp_ix )
mean_constraint_nbp <- mean(Y_level1a_mean_nbp_const[2, ] - Y_level1a_mean_nbp_const[1, ])
Y_constraint_plot(Y_level1a_mean_nbp_const,1,  main = paste('mean constrained range =', round(mean_constraint_nbp,3)))

Y_level1a_mean_npp_const <- constraint_prop(Y = Y_level1a_mean, constraint_ix = npp_ix )
mean_constraint_npp <- mean(Y_level1a_mean_npp_const[2, ] - Y_level1a_mean_npp_const[1, ])
Y_constraint_plot(Y_level1a_mean_npp_const,3,  main = paste('mean constrained range =', round(mean_constraint_npp,3)))

Y_level1a_mean_cSoil_const <- constraint_prop(Y = Y_level1a_mean, constraint_ix = cSoil_ix )
mean_constraint_cSoil <- mean(Y_level1a_mean_cSoil_const[2, ] - Y_level1a_mean_cSoil_const[1, ])
Y_constraint_plot(Y_level1a_mean_cSoil_const,4,  main = paste('mean constrained range =', round(mean_constraint_cSoil,3)))

Vegetation carbon (cVeg) has the strongest impact on everything else. This is no suprise, given that so few of the ensemble members seem to simulate carbon vegetation high enough.

Y_level1a_mean_cVeg_const <- constraint_prop(Y = Y_level1a_mean, constraint_ix = cVeg_ix )
mean_constraint_cVeg <- mean(Y_level1a_mean_cVeg_const[2, ] - Y_level1a_mean_cVeg_const[1, ])
Y_constraint_plot(Y_level1a_mean_cVeg_const,5,  main = paste('mean constrained range =', round(mean_constraint_cVeg,3)))

# with 3 processers, using foreach parallel processing takes approximately 1/3 the time (around 20 seconds per emulator) of looping the emulator fitting directly. I'd hope this would scale with processor numbers - it'll be worth trying on SPICE.

# system.time(testloop_par <- direct_twoStep_glmnet_parallel(X=X_level1, Y = Y_level1))

6.3 Writing robust, parallelised, multistart emulators

direct_twoStep_glmnet <- function(X, Y, ...){
  
  d <- ncol(Y)
  
  out <- vector(mode='list', length=d)
  
  for(i in 1:d){
    
    y <- Y[,i]
    
    em <- twoStep_glmnet(X=X, y = y, ...)
    out[i] <- em
 
  }
  
  out
}

7 Direct twoStep emulator using foreach

library(doParallel)
registerDoParallel(cores = detectCores() - 1)

direct_twoStep_glmnet_parallel <- function(X, Y, ...){
  
  d <- ncol(Y)
  
  out <- vector(mode='list', length=d)
  
  foreach(i = 1:d) %dopar% {
    
    y <- Y[,i]
    
    em <- twoStep_glmnet(X=X, y = y, ...)
    out[i] <- em
 
  }
  
  out
}

# I actually wrote code to do this createKmFitList, which isn't parallel at the moment.
# Maybe make it parallel in the next version?

library(doParallel)
registerDoParallel(cores = detectCores() - 1)

direct_twoStep_glmnet_parallel <- function(X, Y, ...){
  
  d <- ncol(Y)
  foreach(i = 1:d) %dopar% {
    
    y <- Y[,i]
    
    em <- twoStep_glmnet(X=X, y = y, ...)
  }
}

library(doParallel)
cores=detectCores()
cl <- makeCluster(cores[1]-1) #not to overload your computer
registerDoParallel(cl)

fitlist_test <- createKmFitListParallel(X = X_level1a, Y =  Y_const_level1a_scaled, formula = ~., multistart = 4)
stopCluster(cl)

plot(fitlist_test[[4]])

#summariseKmLoo <- function(fit){
  
#  loo <- leaveOneOut.km(fit, type = 'UK', trend.reestim = TRUE)
  
#  loo_mean <- mean(loo$)
  
  
#}

# system.time(testloop <- direct_twoStep_glmnet(X=X_level1, Y_level1))

# Test parallel processing against a foreach processing (and maybe mclapply?)

#system.time(testloop_par <- direct_twoStep_glmnet_parallel(X=X_level1, Y_level1))

emList <- function(X, Y){
  # Create a list of objects to be emulated
  # X             design matrix with (nrow) instances of (ncol) inputs
  # Y             matrix of outputs, with one row
  #               for each row of X

  d <- ncol(Y)
  em_list <- vector(mode='list', length=d)

  for(i in 1:d){
    em_obj <- NULL
    em_obj$X <- X
    em_obj$y <- Y[, i]
    em_list[[i]] <- em_obj
  }
  em_list
}

#test <- emlist(X_level1, Y_level1)

#mclapply(X = test, FUN = km)

# function from hde for direct prediction
direct.pred = function (form, X, Y, Xnew, ...){
  # Directly applies km in parallel to predict each column of an ensemble
  ens.list = emlist(X = X, Y = Y)
  km.list = mclapply(ens.list, FUN = km.wrap, form = form)

  pred.list = mclapply(km.list, FUN = km.pred.wrap, Xnew = as.matrix(Xnew, nrow = 1), type = "UK")

  out.mean = sapply(pred.list, FUN=extract.predmean)
  out.sd = sapply(pred.list, FUN=extract.predsd)
  return(list(mean = out.mean, sd = out.sd))
}

Sensitivity analysis and constraint of the parameter space of Earth System configuration of JULES

Doug McNeall

24/03/2021