Introduction

This notebook builds a high-dimensional emulator of timeseries of JULES global mean carbon cycle outputs. It uses PCA to reduce the dimension of the output, and builds a Gaussian Process emulator of the output in the reduced-dimension space. We start with NPP and the NPP anomaly.

Preliminaries

# Libraries
library(DiceKriging)
# Helper functions
mat2list <- function(X){
  
  # Turns the p columns of a matrix into a p length list,
  # with each column becoming an element of the list
  
  out <- vector(mode = 'list', length = ncol(X))
  for(i in 1:ncol(X)){
    out[[i]] <- X[ , i]
  }
  out
}


reset <- function() {
  # Allows annotation of graphs, resets axes
  par(mfrow=c(1, 1), oma=rep(0, 4), mar=rep(0, 4), new=TRUE)
  plot(0:1, 0:1, type="n", xlab="", ylab="", axes=FALSE)
}

makeTransparent <- function(someColor, alpha=100){
  # Transparent colours for plotting
  newColor <- col2rgb(someColor)
  apply(newColor, 2, function(curcoldata){rgb(red=curcoldata[1], green=curcoldata[2],
                                              blue=curcoldata[3], alpha=alpha, maxColorValue=255)})
}
# Load data
load('data/ensemble-jules-historical-timeseries.RData')

NAs in the data

count_na <- function(x){
  # Count the NA values in an object
  sum(is.na(x))
}

count_na(npp_ens_wave00)
## [1] 0
ens_charvec <- ls(pattern = "ens_wave00")

for(i in ens_charvec){
  
  print(count_na(get(i)))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0

Split out training and test data

We use the last 20% of the runs as a test set (the runs are in random order, so this is effectively a random split).

# Restrict the analysis to post-1950 years
years_ix <- 101:164

train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
test_ix  <- 400:499

X_train <- X[train_ix, ]
X_test  <- X[test_ix, ]

Y_train <- npp_ens_wave00[train_ix, years_ix ]
Y_test  <- npp_ens_wave00[test_ix, years_ix]


years_trunc <- years[years_ix]

Plot the NPP timeseries training set

matplot(years_trunc, t(Y_train), type = 'l', lty = 'solid', col = 'black',
        xlab = 'years', ylab = 'NPP (training output)',
        ylim = c(0,180))

## Perform PCA

How many PCs do we need to keep? The scree plot below suggests that a single PC captures most of the variance.

# perform a PCA on the training output
pca <- prcomp(Y_train, center = TRUE, scale = TRUE)

plot(pca)

# How many principal components do we wish to keep?
npc <- 3

scores <- pca$x[ ,1:npc]

# project the truncated scores back up, to see how well they reproduce the 
# original data
  
anom <- pca$rotation[ ,1:npc] %*% t(scores)*pca$scale
tens <- t(anom + pca$center)
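
Before settling on a truncation, it can help to check the cumulative proportion of variance explained numerically; a minimal check (not in the original analysis) using the standard deviations stored by prcomp:

# Cumulative proportion of variance explained by the leading components
prop_var <- pca$sdev^2 / sum(pca$sdev^2)
round(cumsum(prop_var)[1:10], 4)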

Perfect (truncated) scores reconstruction

Reconstruct the training data with the perfect scores and truncated number of PCs - this is the smallest error we can expect with an emulation.

par(mfrow = c(1,2))

matplot(years_trunc, t(Y_train), type = 'l', lty = 'solid', 
        col = 'black', xlab = 'years', ylab = 'NPP (training output)', 
        main = 'reconstruction',
        ylim = c(0,200))
matlines(years_trunc, t(tens),  lty = 'solid', col = 'red')

legend('topleft', lty = 'solid', col =c('black', 'red'), legend = c('observed', 'predicted'))

Y_train_err <- tens - Y_train

matplot(years_trunc, t(Y_train_err), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'training output',
        main = 'error',
        ylim = c(-4,4)
        )

Build an emulator for each of the PCs

scores_em_mean_test <- NULL
scores_em_sd_test <- NULL

scores_em_mean_train <- NULL

for(i in 1:npc){
  #
  y <- pca$x[,i]
  fit <- km(~., design = X_train, response = y)
  loo <- leaveOneOut.km(fit, type = 'UK', trend.reestim = TRUE)
  
  
  pred_test <- predict(fit, newdata = X_test, type = 'UK')
  
  scores_em_mean <- pred_test$mean
  scores_em_sd <- pred_test$sd
  
  scores_em_mean_test <- cbind(scores_em_mean_test, scores_em_mean)
  scores_em_sd_test   <- cbind(scores_em_sd_test, scores_em_sd)
  
  scores_em_mean_train <- cbind(scores_em_mean_train, loo$mean)
  
}
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  -1286.659 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=       1286.7  |proj g|=       1.8853
## At iterate     1  f =       1248.1  |proj g|=         1.456
## At iterate     2  f =       1241.7  |proj g|=        1.9318
## At iterate     3  f =       1233.2  |proj g|=        1.9311
## ys=-1.488e+00  -gs= 7.777e+00, BFGS update SKIPPED
## At iterate     4  f =       1222.8  |proj g|=        1.8771
## At iterate     5  f =       1211.7  |proj g|=        1.7988
## At iterate     6  f =       1202.5  |proj g|=        1.7691
## At iterate     7  f =       1197.8  |proj g|=        1.8207
## At iterate     8  f =       1195.1  |proj g|=        1.7605
## At iterate     9  f =       1193.5  |proj g|=        1.8294
## At iterate    10  f =       1193.1  |proj g|=        1.7433
## At iterate    11  f =       1192.5  |proj g|=        1.0071
## At iterate    12  f =       1189.9  |proj g|=       0.89427
## At iterate    13  f =       1186.6  |proj g|=        0.9149
## At iterate    14  f =       1185.5  |proj g|=        1.8162
## At iterate    15  f =       1184.5  |proj g|=        1.8178
## At iterate    16  f =         1184  |proj g|=        1.0408
## At iterate    17  f =       1183.6  |proj g|=        1.1663
## At iterate    18  f =       1183.2  |proj g|=        1.7243
## At iterate    19  f =       1182.5  |proj g|=        1.8146
## At iterate    20  f =       1182.3  |proj g|=          1.81
## At iterate    21  f =       1182.2  |proj g|=        1.2211
## At iterate    22  f =       1182.2  |proj g|=        1.0004
## At iterate    23  f =       1181.5  |proj g|=       0.95857
## At iterate    24  f =       1181.1  |proj g|=       0.82354
## At iterate    25  f =         1181  |proj g|=        0.8134
## At iterate    26  f =         1181  |proj g|=       0.78801
## At iterate    27  f =       1180.9  |proj g|=        1.8065
## At iterate    28  f =       1180.8  |proj g|=        1.8082
## At iterate    29  f =       1180.8  |proj g|=        1.7545
## At iterate    30  f =       1180.8  |proj g|=       0.66165
## At iterate    31  f =       1180.8  |proj g|=         1.489
## At iterate    32  f =       1180.8  |proj g|=       0.54519
## At iterate    33  f =       1180.8  |proj g|=       0.57186
## At iterate    34  f =       1180.7  |proj g|=        1.5258
## At iterate    35  f =       1180.7  |proj g|=        1.7555
## At iterate    36  f =       1180.7  |proj g|=        1.6149
## At iterate    37  f =       1180.7  |proj g|=       0.66015
## At iterate    38  f =       1180.7  |proj g|=       0.46509
## At iterate    39  f =       1180.7  |proj g|=       0.25264
## At iterate    40  f =       1180.7  |proj g|=       0.25305
## At iterate    41  f =       1180.7  |proj g|=       0.25209
## At iterate    42  f =       1180.7  |proj g|=       0.16265
## At iterate    43  f =       1180.7  |proj g|=       0.20515
## At iterate    44  f =       1180.7  |proj g|=      0.013583
## At iterate    45  f =       1180.7  |proj g|=       0.01352
## 
## iterations 45
## function evaluations 56
## segments explored during Cauchy searches 60
## BFGS updates skipped 1
## active bounds at final generalized Cauchy point 26
## norm of the final projected gradient 0.0135197
## final function value 1180.69
## 
## F = 1180.69
## final  value 1180.686865 
## converged
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  168.5318 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=      -168.53  |proj g|=       1.7977
## At iterate     1  f =      -169.12  |proj g|=        1.7219
## At iterate     2  f =      -181.65  |proj g|=        1.4016
## At iterate     3  f =      -184.12  |proj g|=        1.8291
## At iterate     4  f =      -185.77  |proj g|=        1.6628
## At iterate     5  f =       -188.5  |proj g|=        1.7391
## At iterate     6  f =      -189.44  |proj g|=        1.6012
## At iterate     7  f =       -193.3  |proj g|=          1.71
## At iterate     8  f =      -194.29  |proj g|=        1.7176
## At iterate     9  f =      -197.07  |proj g|=        1.7039
## At iterate    10  f =      -198.32  |proj g|=        1.6584
## At iterate    11  f =       -198.9  |proj g|=         1.639
## At iterate    12  f =      -199.68  |proj g|=        1.6895
## At iterate    13  f =      -199.95  |proj g|=        1.2701
## At iterate    14  f =       -200.6  |proj g|=        1.6119
## At iterate    15  f =      -200.99  |proj g|=       0.79384
## At iterate    16  f =       -201.3  |proj g|=        1.7058
## At iterate    17  f =      -201.58  |proj g|=        1.6921
## At iterate    18  f =      -201.97  |proj g|=       0.75943
## At iterate    19  f =      -202.48  |proj g|=       0.96771
## At iterate    20  f =      -202.61  |proj g|=       0.90947
## At iterate    21  f =      -202.78  |proj g|=        1.0436
## At iterate    22  f =      -202.96  |proj g|=       0.86211
## At iterate    23  f =      -203.08  |proj g|=       0.81763
## At iterate    24  f =      -203.13  |proj g|=       0.77121
## At iterate    25  f =      -203.16  |proj g|=       0.74209
## At iterate    26  f =       -203.3  |proj g|=        1.3186
## At iterate    27  f =      -203.32  |proj g|=       0.40967
## At iterate    28  f =      -203.38  |proj g|=        0.3241
## At iterate    29  f =      -203.43  |proj g|=       0.47016
## At iterate    30  f =      -203.46  |proj g|=       0.42563
## At iterate    31  f =      -203.49  |proj g|=        1.5172
## At iterate    32  f =      -203.51  |proj g|=       0.50401
## At iterate    33  f =      -203.52  |proj g|=       0.42811
## At iterate    34  f =      -203.53  |proj g|=       0.51492
## At iterate    35  f =      -203.53  |proj g|=       0.27638
## At iterate    36  f =      -203.53  |proj g|=       0.14328
## At iterate    37  f =      -203.54  |proj g|=       0.18107
## At iterate    38  f =      -203.54  |proj g|=      0.080715
## At iterate    39  f =      -203.54  |proj g|=      0.058844
## At iterate    40  f =      -203.54  |proj g|=      0.033909
## At iterate    41  f =      -203.54  |proj g|=      0.059998
## At iterate    42  f =      -203.54  |proj g|=       0.18704
## At iterate    43  f =      -203.54  |proj g|=      0.034582
## At iterate    44  f =      -203.54  |proj g|=       0.03858
## At iterate    45  f =      -203.54  |proj g|=       0.04233
## At iterate    46  f =      -203.54  |proj g|=      0.055205
## At iterate    47  f =      -203.54  |proj g|=       0.13286
## At iterate    48  f =      -203.54  |proj g|=      0.014659
## At iterate    49  f =      -203.54  |proj g|=      0.011585
## At iterate    50  f =      -203.54  |proj g|=      0.040125
## At iterate    51  f =      -203.54  |proj g|=      0.034438
## At iterate    52  f =      -203.54  |proj g|=      0.012412
## At iterate    53  f =      -203.54  |proj g|=     0.0075297
## At iterate    54  f =      -203.54  |proj g|=      0.018592
## At iterate    55  f =      -203.54  |proj g|=      0.012649
## At iterate    56  f =      -203.54  |proj g|=      0.021603
## At iterate    57  f =      -203.54  |proj g|=     0.0042614
## At iterate    58  f =      -203.54  |proj g|=      0.004044
## At iterate    59  f =      -203.54  |proj g|=     0.0032159
## At iterate    60  f =      -203.54  |proj g|=      0.015432
## At iterate    61  f =      -203.54  |proj g|=     0.0030054
## At iterate    62  f =      -203.54  |proj g|=     0.0038065
## At iterate    63  f =      -203.54  |proj g|=     0.0040125
## At iterate    64  f =      -203.54  |proj g|=      0.010214
## At iterate    65  f =      -203.54  |proj g|=     0.0021606
## At iterate    66  f =      -203.54  |proj g|=     0.0016152
## 
## iterations 66
## function evaluations 74
## segments explored during Cauchy searches 73
## BFGS updates skipped 0
## active bounds at final generalized Cauchy point 19
## norm of the final projected gradient 0.0016152
## final function value -203.537
## 
## F = -203.537
## final  value -203.537367 
## converged
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  686.9548 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=      -686.95  |proj g|=       1.7948
## At iterate     1  f =      -693.89  |proj g|=        1.8917
## At iterate     2  f =      -694.45  |proj g|=        1.5386
## At iterate     3  f =      -695.16  |proj g|=        1.6179
## At iterate     4  f =      -699.23  |proj g|=        1.7849
## At iterate     5  f =      -701.05  |proj g|=        1.7567
## At iterate     6  f =      -702.13  |proj g|=        1.8679
## At iterate     7  f =       -702.6  |proj g|=        1.6509
## At iterate     8  f =      -703.99  |proj g|=        1.6352
## At iterate     9  f =      -704.31  |proj g|=        1.6159
## At iterate    10  f =      -704.57  |proj g|=        1.8675
## At iterate    11  f =      -705.59  |proj g|=        1.8629
## At iterate    12  f =      -706.28  |proj g|=         1.832
## At iterate    13  f =      -706.98  |proj g|=        1.5766
## At iterate    14  f =      -707.37  |proj g|=         1.862
## At iterate    15  f =      -707.57  |proj g|=        1.5452
## At iterate    16  f =      -707.63  |proj g|=        1.8585
## At iterate    17  f =      -707.71  |proj g|=        1.5331
## At iterate    18  f =      -708.19  |proj g|=        1.7726
## At iterate    19  f =       -710.6  |proj g|=        1.7883
## At iterate    20  f =      -711.06  |proj g|=        1.7749
## At iterate    21  f =      -711.35  |proj g|=        1.8495
## At iterate    22  f =      -711.53  |proj g|=        1.8398
## At iterate    23  f =      -711.85  |proj g|=       0.89124
## At iterate    24  f =      -712.03  |proj g|=         1.232
## At iterate    25  f =      -712.52  |proj g|=        1.2038
## At iterate    26  f =      -712.58  |proj g|=         1.109
## At iterate    27  f =      -712.65  |proj g|=       0.80976
## At iterate    28  f =      -712.78  |proj g|=       0.58775
## At iterate    29  f =      -712.86  |proj g|=       0.48864
## At iterate    30  f =      -712.91  |proj g|=       0.75393
## At iterate    31  f =      -713.03  |proj g|=       0.92058
## At iterate    32  f =      -713.08  |proj g|=       0.45941
## At iterate    33  f =      -713.12  |proj g|=        1.1598
## At iterate    34  f =      -713.13  |proj g|=       0.50621
## At iterate    35  f =       -713.2  |proj g|=         1.596
## At iterate    36  f =      -713.26  |proj g|=         1.602
## At iterate    37  f =      -713.31  |proj g|=        1.5961
## At iterate    38  f =      -713.32  |proj g|=        0.9729
## At iterate    39  f =      -713.34  |proj g|=       0.34083
## At iterate    40  f =      -713.35  |proj g|=       0.62053
## At iterate    41  f =      -713.36  |proj g|=       0.42734
## At iterate    42  f =      -713.37  |proj g|=       0.37401
## At iterate    43  f =      -713.43  |proj g|=        1.5428
## At iterate    44  f =      -713.48  |proj g|=        1.5656
## At iterate    45  f =       -713.5  |proj g|=        1.8119
## At iterate    46  f =       -713.5  |proj g|=       0.33387
## At iterate    47  f =      -713.51  |proj g|=       0.24663
## At iterate    48  f =      -713.51  |proj g|=       0.37505
## At iterate    49  f =      -713.51  |proj g|=       0.35893
## At iterate    50  f =      -713.52  |proj g|=       0.19192
## At iterate    51  f =      -713.52  |proj g|=       0.62754
## At iterate    52  f =      -713.52  |proj g|=       0.29661
## At iterate    53  f =      -713.52  |proj g|=       0.25122
## At iterate    54  f =      -713.52  |proj g|=       0.08321
## At iterate    55  f =      -713.52  |proj g|=      0.061262
## At iterate    56  f =      -713.52  |proj g|=       0.10055
## At iterate    57  f =      -713.52  |proj g|=      0.082903
## At iterate    58  f =      -713.52  |proj g|=       0.17544
## At iterate    59  f =      -713.52  |proj g|=       0.10645
## At iterate    60  f =      -713.52  |proj g|=      0.062303
## At iterate    61  f =      -713.52  |proj g|=      0.074458
## At iterate    62  f =      -713.52  |proj g|=       0.16492
## At iterate    63  f =      -713.52  |proj g|=       0.14377
## At iterate    64  f =      -713.52  |proj g|=       0.04009
## At iterate    65  f =      -713.52  |proj g|=       0.13043
## At iterate    66  f =      -713.52  |proj g|=      0.039867
## At iterate    67  f =      -713.52  |proj g|=      0.048104
## At iterate    68  f =      -713.52  |proj g|=      0.053999
## At iterate    69  f =      -713.52  |proj g|=      0.064402
## At iterate    70  f =      -713.52  |proj g|=      0.086968
## At iterate    71  f =      -713.52  |proj g|=      0.055652
## At iterate    72  f =      -713.52  |proj g|=      0.083874
## At iterate    73  f =      -713.52  |proj g|=      0.031257
## At iterate    74  f =      -713.52  |proj g|=      0.017897
## At iterate    75  f =      -713.52  |proj g|=       0.01501
## At iterate    76  f =      -713.52  |proj g|=      0.029294
## At iterate    77  f =      -713.52  |proj g|=      0.045344
## At iterate    78  f =      -713.52  |proj g|=       0.11612
## At iterate    79  f =      -713.52  |proj g|=       0.02058
## At iterate    80  f =      -713.52  |proj g|=     0.0078525
## At iterate    81  f =      -713.52  |proj g|=     0.0088307
## 
## iterations 81
## function evaluations 92
## segments explored during Cauchy searches 91
## BFGS updates skipped 0
## active bounds at final generalized Cauchy point 21
## norm of the final projected gradient 0.00883074
## final function value -713.525
## 
## F = -713.525
## final  value -713.524947 
## converged
anom_loo <- pca$rotation[ ,1:2] %*% t(scores_em_mean_train[,1:2])*pca$scale
tens_loo <- t(anom_loo + pca$center)

anom_test <- pca$rotation[ ,1:2] %*% t(scores_em_mean_test[,1:2])*pca$scale
tens_test <- t(anom_test + pca$center)

Predicting the PC scores

How well does the GP do in predicting the training data in the reduced dimension space?

par(mfrow = c(1,3))

for(i in 1:npc){
  
  plot( pca$x[,i], scores_em_mean_train[,i], xlab = 'observed', ylab = 'predicted', main = paste('PC', i))
  abline(0,1)
  
}

Leave-one-out prediction error

In the data space, for the training data set.

Y_loo_err <- tens_loo - Y_train

par(mfrow = c(1,2))

matplot(years_trunc, t(Y_train), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'NPP (training output)',
        main = 'LOO training reconstruction',
        ylim = c(-50,200)
        )

matlines(years_trunc, t(tens_loo), 
        type = 'l',
        lty = 'solid',
        col = 'red',
        )
legend('topleft', lty = 'solid', col =c('black', 'red'), legend = c('observed', 'predicted'))

matplot(years_trunc, t(Y_loo_err), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'NPP error',
        main = 'LOO train prediction error',
        ylim = c(-100,100)
        )

Y_test_err <- tens_test - Y_test

par(mfrow = c(1,2))

matplot(years_trunc, t(Y_test), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'test',
        main = 'Test reconstruction',
        ylim = c(-50,200)
        )

matlines(years_trunc, t(tens_test), 
        type = 'l',
        lty = 'solid',
        col = 'red',
        )
legend('topleft', lty = 'solid', col =c('black', 'red'), legend = c('observed', 'predicted'))

matplot(years_trunc, t(Y_test_err), 
        type = 'l',
        lty = 'solid',
        col = 'black',
        xlab = 'years',
        ylab = 'predicted - test output',
        main = 'Test prediction error',
        ylim = c(-100,100)
        )
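
A quick numerical summary of the prediction errors complements the plots; a sketch using the error matrices computed above:

# Mean absolute error of the leave-one-out and test-set reconstructions
mean(abs(Y_loo_err))
mean(abs(Y_test_err))
sqrt(mean(Y_test_err^2))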

## Are “zero carbon” runs polluting the emulator?

As a check, we emulate just the first (or last) point of the timeseries, to see whether the “zero carbon cycle” members and failed runs are causing problems for the emulator.

y_sp_train <- Y_train[ ,1]
y_sp_test  <- Y_test[ ,1]

fit <- km(~., design = X_train, response = y_sp_train)
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : 
##   - type :  matern5_2 
##   - nugget : NO
##   - parameters lower bounds :  1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 1e-10 
##   - parameters upper bounds :  1.998344 2 1.994471 1.999927 2 2 2 1.996699 2 1.996789 2 2 2 2 1.999374 1.99598 1.998257 2 2 1.996693 1.998361 2 2 2 1.991674 2 2 1.986556 2 2 2 1.998477 
##   - best initial criterion value(s) :  -1726.705 
## 
## N = 32, M = 5 machine precision = 2.22045e-16
## At X0, 0 variables are exactly at the bounds
## At iterate     0  f=       1726.7  |proj g|=       1.8815
## At iterate     1  f =       1712.4  |proj g|=             2
## At iterate     2  f =       1693.2  |proj g|=        1.8923
## At iterate     3  f =         1674  |proj g|=        1.8193
## ys=-6.121e+00  -gs= 1.622e+01, BFGS update SKIPPED
## At iterate     4  f =         1655  |proj g|=        1.9166
## At iterate     5  f =       1641.2  |proj g|=        1.8914
## At iterate     6  f =       1639.3  |proj g|=        1.8347
## At iterate     7  f =       1636.3  |proj g|=        1.8278
## At iterate     8  f =         1636  |proj g|=        1.8383
## At iterate     9  f =       1635.7  |proj g|=        1.8206
## At iterate    10  f =       1635.5  |proj g|=         1.815
## At iterate    11  f =       1635.2  |proj g|=        1.8053
## At iterate    12  f =         1635  |proj g|=        1.7998
## At iterate    13  f =       1634.5  |proj g|=        1.8116
## At iterate    14  f =       1633.1  |proj g|=        1.8167
## At iterate    15  f =       1632.4  |proj g|=        1.8118
## At iterate    16  f =       1631.6  |proj g|=         1.673
## At iterate    17  f =       1631.4  |proj g|=        1.6592
## At iterate    18  f =       1631.2  |proj g|=        1.8107
## At iterate    19  f =       1630.9  |proj g|=        1.5978
## At iterate    20  f =       1630.8  |proj g|=        1.8108
## At iterate    21  f =       1630.4  |proj g|=         1.809
## At iterate    22  f =       1629.8  |proj g|=        1.4884
## At iterate    23  f =       1629.7  |proj g|=        1.4814
## At iterate    24  f =       1629.5  |proj g|=        1.8098
## At iterate    25  f =       1628.9  |proj g|=        1.8094
## At iterate    26  f =       1628.7  |proj g|=        1.7352
## At iterate    27  f =       1628.5  |proj g|=        1.2822
## At iterate    28  f =       1628.2  |proj g|=        1.8079
## At iterate    29  f =       1628.1  |proj g|=         1.166
## At iterate    30  f =       1627.9  |proj g|=       0.95812
## At iterate    31  f =       1627.9  |proj g|=        0.9477
## At iterate    32  f =       1627.9  |proj g|=       0.81417
## At iterate    33  f =       1627.9  |proj g|=       0.45921
## At iterate    34  f =       1627.9  |proj g|=       0.51692
## At iterate    35  f =       1627.9  |proj g|=       0.22284
## At iterate    36  f =       1627.9  |proj g|=       0.13913
## At iterate    37  f =       1627.9  |proj g|=       0.84427
## At iterate    38  f =       1627.9  |proj g|=       0.13065
## At iterate    39  f =       1627.9  |proj g|=      0.074607
## At iterate    40  f =       1627.9  |proj g|=      0.073202
## At iterate    41  f =       1627.9  |proj g|=       0.12392
## At iterate    42  f =       1627.9  |proj g|=       0.38423
## At iterate    43  f =       1627.9  |proj g|=       0.32069
## At iterate    44  f =       1627.9  |proj g|=       0.75815
## At iterate    45  f =       1627.9  |proj g|=        0.1233
## At iterate    46  f =       1627.9  |proj g|=      0.039419
## At iterate    47  f =       1627.9  |proj g|=      0.025198
## At iterate    48  f =       1627.9  |proj g|=      0.011113
## At iterate    49  f =       1627.9  |proj g|=      0.008441
## 
## iterations 49
## function evaluations 65
## segments explored during Cauchy searches 70
## BFGS updates skipped 1
## active bounds at final generalized Cauchy point 25
## norm of the final projected gradient 0.00844101
## final function value 1627.88
## 
## F = 1627.88
## final  value 1627.877599 
## converged
pred <- predict.km(fit, newdata = X_test, type = 'UK')

It looks like the emulator is struggling to predict those “zero carbon” ensemble members - we should consider removing them.

par(mfrow = c(1,2))
plot(y_sp_test, pred$mean, main = 'Test prediction')
abline(0,1)

plot(y_sp_test - pred$mean, main = 'error' )
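
One way to gauge how many suspected “zero carbon” members are in the test set is to flag runs whose starting NPP is near zero; a sketch, where the threshold of 1 is an arbitrary assumption rather than a value used elsewhere in this notebook:

# Flag test members with near-zero starting NPP and summarise their prediction errors
zero_ix <- which(y_sp_test < 1)
length(zero_ix)
summary(y_sp_test[zero_ix] - pred$mean[zero_ix])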

Does a “two-step” emulator help?

The two-step emulator selects inputs by first fitting a lasso (via glmnet) and discarding any inputs whose coefficients are shrunk to zero, then builds the km emulator on the retained inputs. It doesn't seem to help much.

twoStep_glmnet <- function(X, y, nugget=NULL, nuggetEstim=FALSE, noiseVar=NULL, seed=NULL, trace=FALSE, maxit=100,
                           REPORT=10, factr=1e7, pgtol=0.0, parinit=NULL, popsize=100){
  # Use lasso to reduce input dimension of emulator before
  # building.
  control_list = list(trace=trace, maxit=maxit, REPORT=REPORT, factr=factr, pgtol=pgtol, pop.size=popsize)
  xvars = colnames(X)
  data = data.frame(response=y, x=X)
  colnames(data) <- c("response", xvars)
  nval = length(y)
  
  # fit a lasso by cross validation
  library(glmnet)
  fit_glmnet_cv = cv.glmnet(x=X,y=y)
  
  # The labels of the retained coefficients are here
  # (retains intercept at index zero)
  coef_i = (coef(fit_glmnet_cv, s = "lambda.1se"))@i
  labs = labels(coef(fit_glmnet_cv, s = "lambda.1se"))[[1]]
  labs = labs[-1] # remove intercept
  glmnet_retained = labs[coef_i]
  
  start_form = as.formula(paste("~ ", paste(glmnet_retained , collapse= "+")))
  m = km(start_form, design=X, response=y, nugget=nugget, parinit=parinit,
         nugget.estim=nuggetEstim, noise.var=noiseVar, control=control_list)
  
  return(list(x=X, y=y, nugget=nugget, nuggetEstim=nuggetEstim,
              noiseVar=noiseVar, emulator=m, seed=seed, coefs=m@covariance@range.val,
              trends=m@trend.coef, meanTerms=all.vars(start_form), fit_glmnet_cv=fit_glmnet_cv))
}
twostep_test1 <- twoStep_glmnet(X = X_train, y = pca$x[,1])
## Loading required package: Matrix
## Loaded glmnet 4.1-2
twostep_test2 <- twoStep_glmnet(X = X_train, y = pca$x[,2])
twostep_test3 <- twoStep_glmnet(X = X_train, y = pca$x[,3])
twostep_loo1 <- leaveOneOut.km(model = twostep_test1$emulator, type = 'UK', trend.reestim = TRUE)

The two-step leave-one-out error is a little lower, though not by much, and it hasn't solved the problem of predicting the “zero carbon” members well.


plot(pca$x[,1], scores_em_mean_train[,1])
points(pca$x[,1], twostep_loo1$mean, col = 'red')

abline(0,1)

mean(abs(scores_em_mean_train[,1] - pca$x[,1]))
## [1] 3.43188
mean(abs(twostep_loo1$mean - pca$x[,1]))
## [1] 3.371337
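
It is also worth checking which inputs the lasso step actually retained; twoStep_glmnet stores these in meanTerms:

# Inputs retained by the lasso step for the first principal component
twostep_test1$meanTerms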

How well do we reproduce timeseries anomaly (trend, change) data?

First, anomalize the timeseries matrix

anomalizeTSmatrix = function(x, ix){
  # Anomalize a timeseries matrix: subtract each row's mean over the
  # reference columns (ix) from the entire row
  subx = x[ ,ix]
  sweepstats = apply(subx, 1, FUN=mean)
  anom = sweep(x, 1, sweepstats, FUN = '-')
  anom
}

npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00, 90:111)

More PCs carry appreciable variance this time, so we will need to keep more of them.

Y_anom <- npp_anom_ens_wave00
Y_train_anom <- Y_anom[train_ix, years_ix]
Y_test_anom  <- Y_anom[test_ix, years_ix]

pca_anom <- prcomp(Y_train_anom, center = TRUE, scale = TRUE)

par(mfrow = c(1,2))

matplot(years_trunc, t(Y_train_anom), 
        type = 'l',
        lty = 'solid',
        lwd = 1.6,
        col = makeTransparent('black', 35),
        xlab = 'years',
        ylab = 'NPP anomaly',
        main = 'Training set',
        ylim = c(-5,40)
        )


plot(pca_anom)
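
As a numerical check (not part of the original analysis), we can also look at the cumulative proportion of variance explained by the anomaly PCs, to help choose how many to keep:

# Cumulative proportion of variance explained by the anomaly PCs
round(cumsum(pca_anom$sdev^2 / sum(pca_anom$sdev^2))[1:10], 4)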

# How many principal components do we wish to keep?
npc <- 4

scores_train_anom <- pca_anom$x[ ,1:npc]

# project the truncated scores back up, to see how well they reproduce the 
# original data
  
#anom <- pca$rotation[ ,1:npc] %*% t(scores)*pca$scale
#tens <- t(anom + pca$center)
# Run the emulators in parallel
library(parallel)

Train four emulators and recombine to make a prediction

scores_train_anom_list <- mat2list(scores_train_anom)

fit_list_scores_train_anom <- mclapply(X = scores_train_anom_list, FUN = km, formula = ~., design = X_train,
                                   mc.cores = 4, control = list(trace = FALSE))
scores_train_anom_em_loo <- matrix(ncol = ncol(scores_train_anom), nrow = nrow(scores_train_anom))

for(i in 1:ncol(scores_train_anom_em_loo)){
  
  pred <- leaveOneOut.km(model = fit_list_scores_train_anom[[i]], type = 'UK', trend.reestim = TRUE)
  scores_train_anom_em_loo[ ,i] <- pred$mean
  
}

How well do we predict the PC scores of the training data set? There still seems to be a problem in PC1.

par(mfrow = c(2,2))

for(i in 1:npc){

plot(scores_train_anom[,i], scores_train_anom_em_loo[,i], main = paste('PC', i))
  abline(0,1)
}
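
To put a number on the visual impression, one could compute the correlation between the leave-one-out predictions and the true scores for each retained PC; a minimal sketch:

# Correlation between LOO-predicted and true anomaly PC scores
for(i in 1:npc){
  print(cor(scores_train_anom[ ,i], scores_train_anom_em_loo[ ,i]))
}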

anom_anom_loo <- pca_anom$rotation[ ,1:npc] %*% t(scores_train_anom_em_loo[,1:npc])*pca_anom$scale
anom_tens_loo <- t(anom_anom_loo + pca_anom$center)

#anom_test <- pca$rotation[ ,1:2] %*% t(scores_em_mean_test[,1:2])*pca$scale
#tens_test <- t(anom_test + pca$center)

Y_anom_loo_err <- anom_tens_loo - Y_train_anom

par(mfrow = c(1,2))

matplot(years_trunc, t(Y_train_anom), 
        type = 'l',
        lty = 'solid',
        lwd = 1.5,
        col = makeTransparent('black', 35),
        xlab = 'years',
        ylab = 'NPP anomaly (training output)',
        main = 'Training reconstruction',
        ylim = c(-10,30)
        )

matlines(years_trunc, t(anom_tens_loo), 
        type = 'l',
        lty = 'solid',
         lwd = 1.5,
       col = makeTransparent('red', 35)
        )

matplot(years_trunc, t(Y_anom_loo_err), 
        type = 'l',
        lty = 'solid',
         lwd = 1.5,
        col = makeTransparent('black', 35),
        xlab = 'years',
        ylab = 'predicted - train output',
        main = 'Training prediction error',
        ylim = c(-15,15)
        )

## Training data reconstructions as Sparklines

# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(20, 20), mar = c(0.1, 0.1, 0.1, 0.1))

for(i in 1:398){
  
  plot(Y_train_anom[i, ], axes = FALSE, type = 'n', ylim = c(-10,30))
  
    # Plot region color
rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
     col = "lightgrey", border = 'lightgrey') # Color

  lines(Y_train_anom[i, ],col = 'black')

  lines(anom_tens_loo[i,], col = 'red')
  
}

NPP anomaly test data predictions

scores_test_anom_em <- NULL

for(i in 1:npc){
  
  pred <- predict.km(fit_list_scores_train_anom[[i]], newdata = X_test, type = 'UK')
  scores_test_anom_em <- cbind(scores_test_anom_em, pred$mean)
  
}
#anom_loo <- pca$rotation[ ,1:2] %*% t(scores_em_mean_train[,1:2])*pca$scale
#tens_loo <- t(anom_loo + pca$center)

anom_anom <- pca_anom$rotation[ ,1:2] %*% t(scores_test_anom_em[,1:2])*pca_anom$scale
tens_anom_test <- t(anom_anom + pca_anom$center)
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(1,1,5,1))

for(i in 1:100){
  
  
    plot(Y_test_anom[i, ], axes = FALSE, type = 'n', lty = 'solid', ylim = c(-10,30))
    # Plot region color
rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
     col = "lightgrey", border = 'lightgrey') # Color
  
  lines(Y_test_anom[i, ], col = 'black')
  lines(tens_anom_test[i,], col = 'red')
  
}

mtext(side = 3, text = 'NPP anomaly test set predictions', outer = TRUE, cex = 1.5, line = 3)
reset()
legend('topleft', legend = c('observed', 'predicted'), lty = 'solid', col = c('black', 'red'), horiz = TRUE)

# The key feature is that pca_km returns the fitted emulators (fit_list), which can then be reused to predict at new design points (newdata).

pca_km <- function(X, Y, train_ix, test_ix, npc, scale = FALSE, center = TRUE, ...){
  # emulate high dimensional output  
  
  require(parallel)
  
  # Split into training and test samples
  X_train <- X[train_ix, ]
  X_test  <- X[test_ix, ]
  
  Y_train <- Y[train_ix, ]
  Y_test  <- Y[test_ix, ]
  
  
  #reduce dimension of the output
  pca <- prcomp(Y_train, scale = scale, center = center)
  
  
  scores_train_list <- mat2list(pca$x[, 1:npc])
  
  # fitting the emulator is a slow step, so we use parallel computation
  # Fit an emulator to each principal component in turn
  fit_list <- mclapply(X = scores_train_list, FUN = km, formula = ~., design = X_train,
                       mc.cores = 4, control = list(trace = FALSE, maxit = 200))
  
  scores_em_test_mean  <- NULL
  scores_em_test_sd    <- NULL
  
  scores_em_train_mean <- NULL
  scores_em_train_sd   <- NULL
  
  for(i in 1:npc){
    
    loo <- leaveOneOut.km(fit_list[[i]], type = 'UK', trend.reestim = TRUE)
    pred_test <- predict(fit_list[[i]], newdata = X_test, type = 'UK')
    
    # Predict training data (low dimension representation)
    scores_em_train_mean <- cbind(scores_em_train_mean, loo$mean)
    scores_em_train_sd <- cbind(scores_em_train_sd, loo$sd)                            
    
    # Predict test data (low dimension representation)                         
    scores_em_test_mean <- cbind(scores_em_test_mean, pred_test$mean)
    scores_em_test_sd   <- cbind(scores_em_test_sd, pred_test$sd)
    
  }
  
  # Project back up to high dimension
  if(scale){
    anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])*pca$scale
    anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])*pca$scale
    
    sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc])*pca$scale)
    sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc])*pca$scale)
  } else {
    anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])
    anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])
    
    sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc]))
    sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc]))
    
  }
  
  Ypred_train <- t(anom_train + pca$center)
  Ypred_test <- t(anom_test + pca$center)
  
  Yerr_train <- Y_train - Ypred_train
  Yerr_test <- Y_test - Ypred_test
  
  return(list(train_ix = train_ix,
              test_ix = test_ix,
              X_train = X_train,
              X_test = X_test,
              pca = pca,
              fit_list = fit_list,
              scores_em_train_mean = scores_em_train_mean,
              scores_em_train_sd = scores_em_train_sd,
              scores_em_test_mean = scores_em_test_mean,
              scores_em_test_sd = scores_em_test_sd,
              Ypred_train = Ypred_train,
              Ypred_test = Ypred_test,
              sd_train = sd_train,
              sd_test = sd_test,
              Yerr_train = Yerr_train,
              Yerr_test = Yerr_test
  ))
  
}

Are the pre-1950 years causing the problem? Possibly, but restricting to post-1950 years still doesn't fix it.

# How about we look at post 1950? Maybe that is messing it up?
#years_ix <- 101:164

#train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
#test_ix  <- 400:499

X_trunc <- X[-288, ]
Y_trunc <- npp_ens_wave00[-288, years_ix ]
train_ix = 1:398
test_ix = 399:498

pc_km_npp <- pca_km(X = X_trunc,
                     Y_trunc,
                     train_ix = train_ix, test_ix = test_ix, 
                     npc = 3, 
                     scale = FALSE,
                     center = TRUE)
plot(pc_km_npp$pca$x[,1],pc_km_npp$scores_em_train_mean[,1] )
abline(0,1)

# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1))

for(i in 1:100){
  
  
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-50, 200), type = 'n', axes = FALSE)
  
  rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
      col = "grey90", border = "grey90") # Color
  
  lines(years_trunc, (Y_trunc[test_ix, ])[i, ], col = 'black')
  
  
  lines(years_trunc, pc_km_npp$Ypred_test[i, ], col = 'red')
  
}
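
As the comment above pca_km notes, the returned fit_list can be reused to predict at new design points. A minimal sketch, here simply re-predicting at a few existing test inputs as a stand-in for a genuinely new design matrix:

# Reuse the stored km fit for PC1 at new inputs (X_new is a stand-in here)
X_new <- X_trunc[test_ix[1:5], ]
pred_pc1 <- predict(pc_km_npp$fit_list[[1]], newdata = X_new, type = 'UK')
pred_pc1$mean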

npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00, 90:111)

Predict NPP anomaly post 1950

There are still more examples of missed predictions than we’d like.

# How about we look at post 1950? Maybe that is messing it up?
#years_ix <- 101:164

#train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
#test_ix  <- 400:499

X_trunc <- X[-288, ]
Y_trunc <- npp_anom_ens_wave00[-288, years_ix ]
train_ix = 1:398
test_ix = 399:498

pc_km_npp_anom <- pca_km(X = X_trunc,
                      Y_trunc,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 5, 
                      scale = TRUE,
                      center = TRUE)
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))

for(i in 1:100){
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-5, 20), type = 'n', axes = FALSE)
  
    rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
      col = "grey90", border = "#f7f7f7") # Color
    
  lines(years_trunc, (Y_trunc[test_ix, ])[i, ], col = 'black')
    
  lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ], col = 'red')
  
}

reset()

mtext(text = 'NPP anomaly test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red'), legend = c('observed', 'predicted'), horiz = TRUE, lty = 'solid')

par(mfrow = c(2,1))


matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40),
        lwd= 2,
        main = "Train",
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_train), lty = 'solid', col = makeTransparent('red',40), lwd = 2)

legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')


matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40), lwd = 2,
        main = 'Test',
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_test), lty = 'solid', col = makeTransparent('red',40), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')

How do things change if we exclude “zero carbon cycle” ensemble members?

McNeall et al. (2023) found that the carbon cycle tends to die if f0 > 0.9 or b_wl < 0.15. Judging by these results, those thresholds appear to work well.
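
A quick check (a sketch; roughly the complement of keep_ix below) of how many wave00 members those thresholds would exclude:

# Number of ensemble members outside the suggested f0 / b_wl thresholds
sum(X[ ,'f0_io'] >= 0.9 | X[ ,'b_wl_io'] <= 0.15)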

Predict NPP with excised “Zero Carbon” ensemble members

# Excise ensemble members we know to cause problems
keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15 )
keep_ix <- keep_ix[ - which(keep_ix == 288)]

X_trunc <- X[keep_ix, ]
Y_trunc <- npp_ens_wave00[keep_ix, years_ix ]
train_ix = 1:300
test_ix = 301:374

pc_km_npp  <- pca_km(X = X_trunc,
                      Y_trunc,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 3, 
                      scale = TRUE,
                      center = TRUE)
par(mfrow = c(2,1))


matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = "Train",
        ylab = 'NPP')
matlines(years_trunc, t(pc_km_npp$Ypred_train), lty = 'solid', col = 'red')

legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')


matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = 'black',
        main = 'Test',
        ylab = 'NPP')
matlines(years_trunc, t(pc_km_npp$Ypred_test), lty = 'solid', col = 'red')
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')

# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1,0.1,0.1,0.1), oma = c(3,0.1,3,0.1))

for(i in 1:length(test_ix)){
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-20, 130), type = 'n', axes = FALSE)
  
    rect(par("usr")[1], par("usr")[3],
    par("usr")[2], par("usr")[4],
    col = "grey90", border = "grey90") # Color
        
  lines(years_trunc, (Y_trunc[test_ix, ])[i, ], lwd = 1.5)
        
  lines(years_trunc, pc_km_npp$Ypred_test[i, ], col = 'red', lwd = 1.5)
  lines(years_trunc, pc_km_npp$Ypred_test[i, ] +(2* pc_km_npp$sd_test[i, ]),  col = makeTransparent('red', 100))
  lines(years_trunc, pc_km_npp$Ypred_test[i, ] -(2* pc_km_npp$sd_test[i, ]),  col = makeTransparent('red', 100))
  
}

reset()

mtext(text = 'NPP Test Set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))

plot(Y_trunc[test_ix, ], pc_km_npp$Ypred_test, xlim = c(0,120), ylim = c(0,120))
abline(0,1)
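
A rough numerical summary of the test-set accuracy for the excised-ensemble NPP emulator (a sketch using objects defined above):

# Mean absolute error and RMSE of the NPP test-set predictions
npp_test_err <- pc_km_npp$Ypred_test - Y_trunc[test_ix, ]
mean(abs(npp_test_err))
sqrt(mean(npp_test_err^2))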

Predict NPP anomaly with excised “zero carbon” ensemble members

keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15 )
keep_ix <- keep_ix[ - which(keep_ix == 288)]

X_trunc <- X[keep_ix, ]
Y_trunc <- npp_anom_ens_wave00[keep_ix, years_ix ]
train_ix = 1:300
test_ix = 301:374

pc_km_npp_anom <- pca_km(X = X_trunc,
                      Y_trunc,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 5, 
                      scale = TRUE,
                      center = TRUE)
par(mfrow = c(2,1))


matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40), lwd = 2,
        main = "Train",
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_train), lty = 'solid', col = makeTransparent('red',40), lwd = 2)

legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')


matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40), lwd = 2,
        main = 'Test',
        ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_test), lty = 'solid', col = makeTransparent('red',40), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')

# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))

for(i in 1:length(test_ix)){
  
  
  plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-5, 20), type = 'n', axes = FALSE)
  
      rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
      col = "grey90", border = "grey90") # Color
      
  lines(years_trunc, (Y_trunc[test_ix, ])[i, ], col = 'black', lwd = 1.5)
      
  lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ], col = 'red', lwd = 1.5)
  
  lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ] + 2*(pc_km_npp_anom$sd_test[i, ]), col = makeTransparent('red',100))
  lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ] - 2*(pc_km_npp_anom$sd_test[i, ]), col = makeTransparent('red',100))
  
  
}

reset()

mtext(text = 'NPP anomaly test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))

How much better is the emulator with more data added?

The second wave of JULES runs (wave01) shouldn't have any real problems with “bad” members, as most should be drawn from (relatively) good parts of parameter space.

keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15 )
keep_ix <- keep_ix[ - which(keep_ix == 288)]

X_trunc <- X[keep_ix, ]
Y_trunc <- npp_ens_wave00[keep_ix, years_ix ]
train_ix = 1:300
test_ix = 301:374

train_ix_wave01 <- 1:300
test_ix_wave01 <- 301:400

X_trunc_append <- rbind(X_trunc, X_wave01_train)
Y_trunc_append <- rbind(Y_trunc,npp_ens_wave01[ , years_ix ] )




#X_trunc_train_append <- rbind(X_trunc[train_ix, ], X_wave01_train[train_ix_wave01, ])
#Y_trunc_train_append <- rbind(Y_trunc[train_ix, ],npp_ens_wave01[train_ix_wave01, years_ix ] )

#X_trunc_test_append <- rbind(X_trunc[test_ix, ], X_wave01_train[test_ix_wave01, ])
#Y_trunc_test_append <- rbind(Y_trunc[test_ix, ], npp_ens_wave01[test_ix_wave01, years_ix ] )
train_ix <- c(1:300, 375:674)
test_ix <- c(301:374, 675:774)

pc_km_npp_append <- pca_km(X = X_trunc_append,
                      Y_trunc_append,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 3, 
                      scale = TRUE,
                      center = TRUE)
plot(pc_km_npp_append$pca)

# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 20), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))

for(i in 1:length(test_ix)){
  
  
  plot(years_trunc, (Y_trunc_append[test_ix, ])[i, ], ylim = c(-10, 120), type = 'n', axes = FALSE)
  
      rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
      col = "grey90", border = "grey90") # Color
      
  lines(years_trunc, (Y_trunc_append[test_ix, ])[i, ], col = 'black', lwd = 1.5)
      
  lines(years_trunc, pc_km_npp_append$Ypred_test[i, ], col = 'red', lwd = 1.5)
  
  lines(years_trunc, pc_km_npp_append$Ypred_test[i, ] + 2*(pc_km_npp_append$sd_test[i, ]), col = makeTransparent('red',100))
  lines(years_trunc, pc_km_npp_append$Ypred_test[i, ] - 2*(pc_km_npp_append$sd_test[i, ]), col = makeTransparent('red',100))
  
  
}

reset()

mtext(text = 'NPP test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))

npp_anom_ens_wave01 <- anomalizeTSmatrix(npp_ens_wave01, 90:111)

keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15 )
keep_ix <- keep_ix[ - which(keep_ix == 288)]

X_trunc <- X[keep_ix, ]
Y_trunc <- npp_anom_ens_wave00[keep_ix, years_ix ]


X_trunc_append <- rbind(X_trunc, X_wave01_train)
Y_trunc_append <- rbind(Y_trunc,npp_anom_ens_wave01[ , years_ix ] )
train_ix <- c(1:300, 375:674)
test_ix <- c(301:374, 675:774)

pc_km_npp_anom_append <- pca_km(X = X_trunc_append,
                      Y_trunc_append,
                       train_ix = train_ix, 
                      test_ix = test_ix, 
                       npc = 5, 
                      scale = TRUE,
                      center = TRUE)
plot(pc_km_npp_anom_append$pca)

# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 20), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))

for(i in 1:length(test_ix)){
  
  
  plot(years_trunc, (Y_trunc_append[test_ix, ])[i, ], ylim = c(-2, 20), type = 'n', axes = FALSE)
  
      rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
      col = "grey90", border = "grey90") # Color
      
  lines(years_trunc, (Y_trunc_append[test_ix, ])[i, ], col = 'black', lwd = 1.5)
      
  lines(years_trunc, pc_km_npp_anom_append$Ypred_test[i, ], col = 'red', lwd = 1.5)
  
  lines(years_trunc, pc_km_npp_anom_append$Ypred_test[i, ] + 2*(pc_km_npp_anom_append$sd_test[i, ]), col = makeTransparent('red',100))
  lines(years_trunc, pc_km_npp_anom_append$Ypred_test[i, ] - 2*(pc_km_npp_anom_append$sd_test[i, ]), col = makeTransparent('red',100))
  
  
}

reset()

mtext(text = 'NPP anomaly test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))

Test with cumulative NBP

cnbp_ens_wave00 <-  t(apply(nbp_ens_wave00, 1, FUN = cumsum))


matplot(years, t(cnbp_ens_wave00[-288,]), col = 'black', type = 'l', lty = 'solid')
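
The same pca_km workflow could in principle be applied to cumulative NBP; a sketch, reusing the exclusion and train/test split from the NPP sections above (the choices of npc = 5 and scale = TRUE here are assumptions, not tested settings):

# Emulate cumulative NBP with the same PCA + km approach (sketch)
Y_trunc_cnbp <- cnbp_ens_wave00[keep_ix, years_ix]
pc_km_cnbp <- pca_km(X = X[keep_ix, ], Y = Y_trunc_cnbp,
                     train_ix = 1:300, test_ix = 301:374,
                     npc = 5, scale = TRUE, center = TRUE)

matplot(years_trunc, t(Y_trunc_cnbp[301:374, ]), type = 'l', lty = 'solid', col = 'black',
        xlab = 'years', ylab = 'cumulative NBP (test)')
matlines(years_trunc, t(pc_km_cnbp$Ypred_test), lty = 'solid', col = 'red')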