This notebook builds a high-dimensional emulator of timeseries of JULES global mean carbon cycle outputs. It uses PCA to reduce the dimension of the output, and then builds a Gaussian process emulator of the output in the reduced-dimension space. We start with NPP and the NPP anomaly.
# Libraries
library(DiceKriging)
# Helper functions
mat2list <- function(X){
  # Turns the p columns of a matrix into a p length list,
  # with each column becoming an element of the list
  out <- vector(mode = 'list', length = ncol(X))
  for(i in 1:ncol(X)){
    out[[i]] <- X[ , i]
  }
  out
}
reset <- function() {
  # Allows annotation of graphs, resets axes
  par(mfrow = c(1, 1), oma = rep(0, 4), mar = rep(0, 4), new = TRUE)
  plot(0:1, 0:1, type = "n", xlab = "", ylab = "", axes = FALSE)
}
makeTransparent <- function(someColor, alpha = 100){
  # Transparent colours for plotting
  newColor <- col2rgb(someColor)
  apply(newColor, 2, function(curcoldata){
    rgb(red = curcoldata[1], green = curcoldata[2],
        blue = curcoldata[3], alpha = alpha, maxColorValue = 255)
  })
}
# Load data
load('data/ensemble-jules-historical-timeseries.RData')
Check for NAs in the data.
count_na <- function(x){
sum(is.na(x))
}
count_na(npp_ens_wave00)
## [1] 0
ens_charvec <- ls(pattern = "ens_wave00")
for(i in ens_charvec){
print(count_na(get(i)))
}
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
We use the last 20% of the runs as a test set (the runs are in random order, so this is effectively a random split).
# How about we look at post 1950? Maybe that is messing it up?
years_ix <- 101:164
train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
test_ix <- 400:499
X_train <- X[train_ix, ]
X_test <- X[test_ix, ]
Y_train <- npp_ens_wave00[train_ix, years_ix ]
Y_test <- npp_ens_wave00[test_ix, years_ix]
years_trunc <- years[years_ix]
matplot(years_trunc, t(Y_train), type = 'l', lty = 'solid', col = 'black',
xlab = 'years', ylab = 'NPP (training output)',
ylim = c(0,180))
## Perform PCA
How many PCs do we need to keep? The scree plot below suggests that a single PC does very well.
# perform a PCA on the training output
pca <- prcomp(Y_train, center = TRUE, scale = TRUE)
plot(pca)
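To make that visual impression concrete, a quick check of the proportion of variance explained by each component, computed directly from the prcomp standard deviations, might look like this:
# proportion of variance explained by the leading PCs, and the cumulative total
prop_var <- pca$sdev^2 / sum(pca$sdev^2)
round(rbind(proportion = prop_var[1:5], cumulative = cumsum(prop_var)[1:5]), 4)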
# How many principal components do we wish to keep?
npc <- 3
scores <- pca$x[ ,1:npc]
# project the truncated scores back up, to see how well they reproduce the
# original data
anom <- pca$rotation[ ,1:npc] %*% t(scores)*pca$scale
tens <- t(anom + pca$center)
Reconstruct the training data with the perfect scores and the truncated number of PCs - this is the smallest error we can expect from an emulator.
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_train), type = 'l', lty = 'solid',
col = 'black', xlab = 'years', ylab = 'NPP (training output)',
main = 'reconstruction',
ylim = c(0,200))
matlines(years_trunc, t(tens), lty = 'solid', col = 'red')
legend('topleft', lty = 'solid', col =c('black', 'red'), legend = c('observed', 'predicted'))
Y_train_err <- tens - Y_train
matplot(years_trunc, t(Y_train_err),
type = 'l',
lty = 'solid',
col = 'black',
xlab = 'years',
ylab = 'training output',
main = 'error',
ylim = c(-4,4)
)
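To put a number on that floor, we can summarise the truncation error directly; a minimal sketch using the reconstruction above:
# overall RMSE and worst-case absolute error of the truncated-PC reconstruction;
# this is the best any emulator of the first npc scores could achieve
sqrt(mean((tens - Y_train)^2))
max(abs(Y_train_err))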
scores_em_mean_test <- NULL
scores_em_sd_test <- NULL
scores_em_mean_train <- NULL
for(i in 1:npc){
  # Fit a GP emulator to the ith PC score
  y <- pca$x[,i]
  fit <- km(~., design = X_train, response = y)
  # Leave-one-out predictions for the training set, and predictions for the test set
  loo <- leaveOneOut.km(fit, type = 'UK', trend.reestim = TRUE)
  pred_test <- predict(fit, newdata = X_test, type = 'UK')
  scores_em_mean <- pred_test$mean
  scores_em_sd <- pred_test$sd
  scores_em_mean_test <- cbind(scores_em_mean_test, scores_em_mean)
  scores_em_sd_test <- cbind(scores_em_sd_test, scores_em_sd)
  scores_em_mean_train <- cbind(scores_em_mean_train, loo$mean)
}
## 
## optimisation start
## ------------------
## * estimation method   : MLE 
## * optimisation method : BFGS 
## * analytical gradient : used
## * trend model : ~alpha_io + a_wl_io + bio_hum_cn + b_wl_io + dcatch_dlai_io + 
##     dqcrit_io + dz0v_dh_io + f0_io + fd_io + g_area_io + g_root_io + 
##     g_wood_io + gs_nvg_io + hw_sw_io + kaps_roth + knl_io + lai_max_io + 
##     lai_min_io + lma_io + n_inorg_turnover + nmass_io + nr_io + 
##     retran_l_io + retran_r_io + r_grow_io + rootd_ft_io + sigl_io + 
##     sorp + tleaf_of_io + tlow_io + tupp_io + l_vg_soil
## * covariance model : matern5_2, no nugget
## iterations 45
## final value 1180.686865 
## converged
## 
## optimisation start (trend and covariance model as above)
## iterations 66
## final value -203.537367 
## converged
## 
## optimisation start (trend and covariance model as above)
## iterations 81
## final value -713.524947 
## converged
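By default km() prints the L-BFGS-B optimiser trace while it fits each emulator. It can be silenced through the control argument (as the later chunks in this notebook do); a minimal sketch:
# refit the first-PC emulator without printing the optimiser trace
fit_quiet <- km(~., design = X_train, response = pca$x[, 1],
                control = list(trace = FALSE))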
# Reconstruct in the data space, using only the first two PC emulators
anom_loo <- pca$rotation[ ,1:2] %*% t(scores_em_mean_train[,1:2])*pca$scale
tens_loo <- t(anom_loo + pca$center)
anom_test <- pca$rotation[ ,1:2] %*% t(scores_em_mean_test[,1:2])*pca$scale
tens_test <- t(anom_test + pca$center)
How well does the GP do in predicting the training data in the reduced dimension space?
par(mfrow = c(1,3))
for(i in 1:npc){
plot( pca$x[,i], scores_em_mean_train[,i], xlab = 'observed', ylab = 'predicted', main = paste('PC', i))
abline(0,1)
}
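A numerical summary of the same comparison, assuming mean absolute error is a reasonable headline metric:
# leave-one-out mean absolute error for each retained PC score
for(i in 1:npc){
  mae <- mean(abs(scores_em_mean_train[ ,i] - pca$x[ ,i]))
  print(paste0('PC', i, ' LOO MAE = ', round(mae, 3)))
}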
Now the same comparison in the data space, first for the training set (using the leave-one-out predictions) and then for the test set.
Y_loo_err <- tens_loo - Y_train
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_train),
type = 'l',
lty = 'solid',
col = 'black',
xlab = 'years',
ylab = 'test',
main = 'LOO training reconstruction',
ylim = c(-50,200)
)
matlines(years_trunc, t(tens_loo),
type = 'l',
lty = 'solid',
col = 'red',
)
legend('topleft', lty = 'solid', col =c('black', 'red'), legend = c('observed', 'predicted'))
matplot(years_trunc, t(Y_loo_err),
type = 'l',
lty = 'solid',
col = 'black',
xlab = 'years',
ylab = 'NPP error',
main = 'LOO train prediction error',
ylim = c(-100,100)
)
Y_test_err <- tens_test - Y_test
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_test),
type = 'l',
lty = 'solid',
col = 'black',
xlab = 'years',
ylab = 'test',
main = 'Test reconstruction',
ylim = c(-50,200)
)
matlines(years_trunc, t(tens_test),
type = 'l',
lty = 'solid',
col = 'red',
)
legend('topleft', lty = 'solid', col =c('black', 'red'), legend = c('observed', 'predicted'))
matplot(years_trunc, t(Y_test_err),
type = 'l',
lty = 'solid',
col = 'black',
xlab = 'years',
ylab = 'test - predicted output',
main = 'Test prediction error',
ylim = c(-100,100)
)
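Summarising the data-space errors for both sets gives a quick check that the test error is comparable to the leave-one-out error; a short sketch:
# mean absolute reconstruction error in the data space
mean(abs(Y_loo_err))   # leave-one-out, training runs
mean(abs(Y_test_err))  # held-out test runs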
## Are “zero carbon” runs polluting the emulator?
To check whether the “zero carbon cycle” members and failed runs are causing problems, we test how well a GP emulator predicts a single timeseries point (here the first year of the truncated period).
y_sp_train <- Y_train[ ,1]
y_sp_test <- Y_test[ ,1]
fit <- km(~., design = X_train, response = y_sp_train)
## 
## optimisation start (trend and covariance model as above)
## iterations 49
## final value 1627.877599 
## converged
pred <- predict.km(fit, newdata = X_test, type = 'UK')
It looks like the emulator is struggling to predict those “zero carbon” ensemble members - we should consider removing them.
par(mfrow = c(1,2))
plot(y_sp_test, pred$mean, main = 'Test prediction')
abline(0,1)
plot(y_sp_test - pred$mean, main = 'error' )
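One way to see how much of the problem they cause is to flag the test members with near-zero NPP at the first timestep. The cutoff of 1 below is an illustrative choice, not a value used elsewhere in this analysis:
# count test members whose first-year NPP is (near) zero
# NB the cutoff of 1 is illustrative only
zero_ix <- which(y_sp_test < 1)
length(zero_ix)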
The two-step emulator selects inputs by first fitting a cross-validated lasso (glmnet) and discarding inputs whose coefficients are shrunk to zero, then builds a km emulator on the retained inputs. It doesn’t seem to help much.
twoStep_glmnet <- function(X, y, nugget=NULL, nuggetEstim=FALSE, noiseVar=NULL, seed=NULL, trace=FALSE, maxit=100,
                           REPORT=10, factr=1e7, pgtol=0.0, parinit=NULL, popsize=100){
  # Use lasso to reduce input dimension of emulator before building
  control_list = list(trace=trace, maxit=maxit, REPORT=REPORT, factr=factr, pgtol=pgtol, pop.size=popsize)
  xvars = colnames(X)
  data = data.frame(response=y, x=X)
  colnames(data) <- c("response", xvars)
  nval = length(y)
  # fit a lasso by cross validation
  library(glmnet)
  fit_glmnet_cv = cv.glmnet(x=X, y=y)
  # The labels of the retained coefficients are here
  # (retains intercept at index zero)
  coef_i = (coef(fit_glmnet_cv, s = "lambda.1se"))@i
  labs = labels(coef(fit_glmnet_cv, s = "lambda.1se"))[[1]]
  labs = labs[-1] # remove intercept
  glmnet_retained = labs[coef_i]
  start_form = as.formula(paste("~ ", paste(glmnet_retained, collapse = "+")))
  m = km(start_form, design=X, response=y, nugget=nugget, parinit=parinit,
         nugget.estim=nuggetEstim, noise.var=noiseVar, control=control_list)
  return(list(x=X, y=y, nugget=nugget, nuggetEstim=nuggetEstim,
              noiseVar=noiseVar, emulator=m, seed=seed, coefs=m@covariance@range.val,
              trends=m@trend.coef, meanTerms=all.vars(start_form), fit_glmnet_cv=fit_glmnet_cv))
}
twostep_test1 <- twoStep_glmnet(X = X_train, y = pca$x[,1])
## Loading required package: Matrix
## Loaded glmnet 4.1-2
twostep_test2 <- twoStep_glmnet(X = X_train, y = pca$x[,2])
twostep_test3 <- twoStep_glmnet(X = X_train, y = pca$x[,3])
twostep_loo1 <- leaveOneOut.km(model = twostep_test1$emulator, type = 'UK', trend.reestim = TRUE)
The two-step error is a little lower, but not by much. It also hasn’t solved the problem of predicting the “zero carbon” members well.
# Compare LOO predictions for PC1: default km (black) vs two-step glmnet+km (red)
plot(pca$x[,1], scores_em_mean_train[,1])
points(pca$x[,1], twostep_loo1$mean, col = 'red')
abline(0,1)
mean(abs(scores_em_mean_train[,1] - pca$x[,1]))
## [1] 3.43188
mean(abs(twostep_loo1$mean - pca$x[,1]))
## [1] 3.371337
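The same leave-one-out comparison can be extended to the other two components, assuming we treat them exactly as PC1:
# leave-one-out MAE for the remaining two-step emulators
twostep_loo2 <- leaveOneOut.km(model = twostep_test2$emulator, type = 'UK', trend.reestim = TRUE)
twostep_loo3 <- leaveOneOut.km(model = twostep_test3$emulator, type = 'UK', trend.reestim = TRUE)
mean(abs(twostep_loo2$mean - pca$x[,2]))
mean(abs(twostep_loo3$mean - pca$x[,3]))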
First, anomalize the timeseries matrix.
anomalizeTSmatrix = function(x, ix){
  # Anomalize a timeseries matrix: subtract each row's mean over the columns in ix
  subx = x[ , ix]
  sweepstats = apply(subx, 1, FUN = mean)
  anom = sweep(x, 1, sweepstats, FUN = '-')
  anom
}
npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00, 90:111)
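As a sanity check, each row of the anomaly should average to (numerically) zero over the baseline columns used above:
# row means over the baseline period should all be ~0
summary(apply(npp_anom_ens_wave00[ , 90:111], 1, mean))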
More PCs carry appreciable variance this time, so we will need to keep more of them.
Y_anom <- npp_anom_ens_wave00
Y_train_anom <- Y_anom[train_ix, years_ix]
Y_test_anom <- Y_anom[test_ix, years_ix]
pca_anom <- prcomp(Y_train_anom, center = TRUE, scale = TRUE)
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_train_anom),
type = 'l',
lty = 'solid',
lwd = 1.6,
col = makeTransparent('black', 35),
xlab = 'years',
ylab = 'NPP anomaly',
main = 'Training set',
ylim = c(-5,40)
)
plot(pca_anom)
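As before, the variance explained per component makes the contrast with the raw NPP case explicit:
# cumulative proportion of variance explained by the leading anomaly PCs
prop_var_anom <- pca_anom$sdev^2 / sum(pca_anom$sdev^2)
round(cumsum(prop_var_anom)[1:6], 4)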
# How many principal components do we wish to keep?
npc <- 4
scores_train_anom <- pca_anom$x[ ,1:npc]
# project the truncated scores back up, to see how well they reproduce the
# original data
#anom <- pca$rotation[ ,1:npc] %*% t(scores)*pca$scale
#tens <- t(anom + pca$center)
# Run the emulators in parallel
library(parallel)
Train four emulators (one per PC) in parallel and recombine them to make a prediction.
scores_train_anom_list <- mat2list(scores_train_anom)
fit_list_scores_train_anom <- mclapply(X = scores_train_anom_list, FUN = km, formula = ~., design = X_train,
mc.cores = 4, control = list(trace = FALSE))
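Note that mclapply relies on forking, which is unavailable on Windows (where mc.cores must be 1). A portable way to choose the core count, offered as a sketch rather than how the fits above were run:
# fall back to serial execution on Windows, otherwise use up to 4 cores
n_cores <- if(.Platform$OS.type == "windows") 1 else min(4, parallel::detectCores())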
scores_train_anom_em_loo <- matrix(ncol = ncol(scores_train_anom), nrow = nrow(scores_train_anom))
for(i in 1:ncol(scores_train_anom_em_loo)){
pred <- leaveOneOut.km(model = fit_list_scores_train_anom[[i]], type = 'UK', trend.reestim = TRUE)
scores_train_anom_em_loo[ ,i] <- pred$mean
}
How well do we predict the PC scores of the training data set? There still seems to be a problem with PC1.
par(mfrow = c(2,2))
for(i in 1:npc){
plot(scores_train_anom[,i], scores_train_anom_em_loo[,i], main = paste('PC', i))
abline(0,1)
}
anom_anom_loo <- pca_anom$rotation[ ,1:npc] %*% t(scores_train_anom_em_loo[,1:npc])*pca_anom$scale
anom_tens_loo <- t(anom_anom_loo + pca_anom$center)
#anom_test <- pca$rotation[ ,1:2] %*% t(scores_em_mean_test[,1:2])*pca$scale
#tens_test <- t(anom_test + pca$center)
Y_anom_loo_err <- anom_tens_loo - Y_train_anom
par(mfrow = c(1,2))
matplot(years_trunc, t(Y_train_anom),
type = 'l',
lty = 'solid',
lwd = 1.5,
col = makeTransparent('black', 35),
xlab = 'years',
ylab = 'test',
main = 'Training reconstruction',
ylim = c(-10,30)
)
matlines(years_trunc, t(anom_tens_loo),
type = 'l',
lty = 'solid',
lwd = 1.5,
col = makeTransparent('red', 35)
)
matplot(years_trunc, t(Y_anom_loo_err),
type = 'l',
lty = 'solid',
lwd = 1.5,
col = makeTransparent('black', 35),
xlab = 'years',
ylab = 'train - predicted output',
main = 'Training prediction error',
ylim = c(-15,15)
)
## Training data reconstructions as Sparklines
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(20, 20), mar = c(0.1, 0.1, 0.1, 0.1))
for(i in 1:398){
plot(Y_train_anom[i, ], axes = FALSE, type = 'n', ylim = c(-10,30))
# Plot region color
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "lightgrey", border = 'lightgrey') # Color
lines(Y_train_anom[i, ],col = 'black')
lines(anom_tens_loo[i,], col = 'red')
}
scores_test_anom_em <- NULL
for(i in 1:npc){
pred <- predict.km(fit_list_scores_train_anom[[i]], newdata = X_test, type = 'UK')
scores_test_anom_em <- cbind(scores_test_anom_em, pred$mean)
}
#anom_loo <- pca$rotation[ ,1:2] %*% t(scores_em_mean_train[,1:2])*pca$scale
#tens_loo <- t(anom_loo + pca$center)
anom_anom <- pca_anom$rotation[ ,1:2] %*% t(scores_test_anom_em[,1:2])*pca_anom$scale
tens_anom_test <- t(anom_anom + pca_anom$center)
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(1,1,5,1))
for(i in 1:100){
plot(Y_test_anom[i, ], axes = FALSE, type = 'n', lty = 'solid', ylim = c(-10,30))
# Plot region color
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "lightgrey", border = 'lightgrey') # Color
lines(Y_test_anom[i, ], col = 'black')
lines(tens_anom_test[i,], col = 'red')
}
mtext(side = 3, text = 'NPP anomaly test set predictions', outer = TRUE, cex = 1.5, line = 3)
reset()
legend('topleft', legend = c('observed', 'predicted'), lty = 'solid', col = c('black', 'red'), horiz = TRUE)
# The key point is that pca_km returns the fitted emulators (fit_list), which can then be re-used to predict at new inputs (newdata) etc.
pca_km <- function(X, Y, train_ix, test_ix, npc, scale = FALSE, center = TRUE, ...){
  # Emulate high-dimensional output via PCA dimension reduction and km emulators
  require(parallel)
  # Split into training and test samples
  X_train <- X[train_ix, ]
  X_test <- X[test_ix, ]
  Y_train <- Y[train_ix, ]
  Y_test <- Y[test_ix, ]
  # Reduce dimension of the output
  pca <- prcomp(Y_train, scale = scale, center = center)
  scores_train_list <- mat2list(pca$x[, 1:npc])
  # Fitting the emulators is the slow step, so we use parallel computation.
  # Fit an emulator to each principal component in turn
  fit_list <- mclapply(X = scores_train_list, FUN = km, formula = ~., design = X_train,
                       mc.cores = 4, control = list(trace = FALSE, maxit = 200))
  scores_em_test_mean <- NULL
  scores_em_test_sd <- NULL
  scores_em_train_mean <- NULL
  scores_em_train_sd <- NULL
  for(i in 1:npc){
    loo <- leaveOneOut.km(fit_list[[i]], type = 'UK', trend.reestim = TRUE)
    pred_test <- predict(fit_list[[i]], newdata = X_test, type = 'UK')
    # Predict training data (low dimension representation)
    scores_em_train_mean <- cbind(scores_em_train_mean, loo$mean)
    scores_em_train_sd <- cbind(scores_em_train_sd, loo$sd)
    # Predict test data (low dimension representation)
    scores_em_test_mean <- cbind(scores_em_test_mean, pred_test$mean)
    scores_em_test_sd <- cbind(scores_em_test_sd, pred_test$sd)
  }
  # Project back up to high dimension
  if(scale){
    anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])*pca$scale
    anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])*pca$scale
    sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc])*pca$scale)
    sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc])*pca$scale)
  } else {
    anom_train <- pca$rotation[ ,1:npc] %*% t(scores_em_train_mean[,1:npc])
    anom_test <- pca$rotation[ ,1:npc] %*% t(scores_em_test_mean[,1:npc])
    sd_train <- t(pca$rotation[ ,1:npc] %*% t(scores_em_train_sd[,1:npc]))
    sd_test <- t(pca$rotation[ ,1:npc] %*% t(scores_em_test_sd[,1:npc]))
  }
  Ypred_train <- t(anom_train + pca$center)
  Ypred_test <- t(anom_test + pca$center)
  Yerr_train <- Y_train - Ypred_train
  Yerr_test <- Y_test - Ypred_test
  return(list(train_ix = train_ix,
              test_ix = test_ix,
              X_train = X_train,
              X_test = X_test,
              pca = pca,
              fit_list = fit_list,
              scores_em_train_mean = scores_em_train_mean,
              scores_em_train_sd = scores_em_train_sd,
              scores_em_test_mean = scores_em_test_mean,
              scores_em_test_sd = scores_em_test_sd,
              Ypred_train = Ypred_train,
              Ypred_test = Ypred_test,
              sd_train = sd_train,
              sd_test = sd_test
  ))
}
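Since the fitted km objects come back in fit_list, predictions at genuinely new inputs can be assembled outside the function. A sketch for the scale = FALSE case, where X_new is a hypothetical design matrix with the same columns as X:
# predict the high-dimensional output at new inputs from a pca_km fit
# (X_new is hypothetical; assumes the fit was made with scale = FALSE)
predict_pca_km <- function(pc_km, X_new, npc){
  # emulator mean of each retained PC score at the new inputs
  scores_new <- sapply(1:npc, function(i){
    predict(pc_km$fit_list[[i]], newdata = X_new, type = 'UK')$mean
  })
  # project back up to the data space and add the centre
  t(pc_km$pca$rotation[ , 1:npc] %*% t(scores_new) + pc_km$pca$center)
}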
# How about we look at post 1950? Maybe that is messing it up?
#years_ix <- 101:164
#train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
#test_ix <- 400:499
# Remove run 288, which has numerical problems for NPP
X_trunc <- X[-288, ]
Y_trunc <- npp_ens_wave00[-288, years_ix]
train_ix = 1:398
test_ix = 399:498
pc_km_npp <- pca_km(X = X_trunc,
Y_trunc,
train_ix = train_ix, test_ix = test_ix,
npc = 3,
scale = FALSE,
center = TRUE)
plot(pc_km_npp$pca$x[,1],pc_km_npp$scores_em_train_mean[,1] )
abline(0,1)
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1))
for(i in 1:100){
plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-50, 200), type = 'n', axes = FALSE)
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "grey90", border = "grey90") # Color
lines(years_trunc, (Y_trunc[test_ix, ])[i, ], col = 'black')
lines(years_trunc, pc_km_npp$Ypred_test[i, ], col = 'red')
}
npp_anom_ens_wave00 <- anomalizeTSmatrix(npp_ens_wave00, 90:111)
There are still more examples of missed predictions than we’d like.
# How about we look at post 1950? Maybe that is messing it up?
#years_ix <- 101:164
#train_ix <- (1:399)[-288] # Number 288 has numerical problems for NPP
#test_ix <- 400:499
# Remove run 288, which has numerical problems for NPP
X_trunc <- X[-288, ]
Y_trunc <- npp_anom_ens_wave00[-288, years_ix]
train_ix = 1:398
test_ix = 399:498
pc_km_npp_anom <- pca_km(X = X_trunc,
Y_trunc,
train_ix = train_ix,
test_ix = test_ix,
npc = 5,
scale = TRUE,
center = TRUE)
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))
for(i in 1:100){
plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-5, 20), type = 'n', axes = FALSE)
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "grey90", border = "#f7f7f7") # Color
lines(years_trunc, (Y_trunc[test_ix, ])[i, ], col = 'black')
lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ], col = 'red')
}
reset()
mtext(text = 'NPP anomaly test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red'), legend = c('observed', 'predicted'), horiz = TRUE, lty = 'solid')
par(mfrow = c(2,1))
matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40),
lwd= 2,
main = "Train",
ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_train), lty = 'solid', col = makeTransparent('red',40), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40), lwd = 2,
main = 'Test',
ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_test), lty = 'solid', col = makeTransparent('red',40), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
McNeall et al. (2023) found that the carbon cycle tends to die if f0_io > 0.9 or b_wl_io < 0.15. If those thresholds are right, they look like good criteria for excluding problem members.
# Excise ensemble members we know to cause problems,
# and drop run 288, which has numerical problems for NPP
keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15)
keep_ix <- setdiff(keep_ix, 288)
X_trunc <- X[keep_ix, ]
Y_trunc <- npp_ens_wave00[keep_ix, years_ix ]
train_ix = 1:300
test_ix = 301:374
pc_km_npp <- pca_km(X = X_trunc,
Y_trunc,
train_ix = train_ix,
test_ix = test_ix,
npc = 3,
scale = TRUE,
center = TRUE)
par(mfrow = c(2,1))
matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = 'black',
main = "Train",
ylab = 'NPP')
matlines(years_trunc, t(pc_km_npp$Ypred_train), lty = 'solid', col = 'red')
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = 'black',
main = 'Test',
ylab = 'NPP')
matlines(years_trunc, t(pc_km_npp$Ypred_test), lty = 'solid', col = 'red')
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1,0.1,0.1,0.1), oma = c(3,0.1,3,0.1))
for(i in 1:length(test_ix)){
plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-20, 130), type = 'n', axes = FALSE)
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "grey90", border = "grey90") # Color
lines(years_trunc, (Y_trunc[test_ix, ])[i, ], lwd = 1.5)
lines(years_trunc, pc_km_npp$Ypred_test[i, ], col = 'red', lwd = 1.5)
lines(years_trunc, pc_km_npp$Ypred_test[i, ] +(2* pc_km_npp$sd_test[i, ]), col = makeTransparent('red', 100))
lines(years_trunc, pc_km_npp$Ypred_test[i, ] -(2* pc_km_npp$sd_test[i, ]), col = makeTransparent('red', 100))
}
reset()
mtext(text = 'NPP Test Set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))
plot(Y_trunc[test_ix, ], pc_km_npp$Ypred_test, xlim = c(0,120), ylim = c(0,120))
abline(0,1)
keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15)
keep_ix <- setdiff(keep_ix, 288)
X_trunc<- X[keep_ix, ]
Y_trunc <- npp_anom_ens_wave00[keep_ix, years_ix ]
train_ix = 1:300
test_ix = 301:374
pc_km_npp_anom <- pca_km(X = X_trunc,
Y_trunc,
train_ix = train_ix,
test_ix = test_ix,
npc = 5,
scale = TRUE,
center = TRUE)
par(mfrow = c(2,1))
matplot(years_trunc, t(Y_trunc[train_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40), lwd = 2,
main = "Train",
ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_train), lty = 'solid', col = makeTransparent('red',40), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
matplot(years_trunc, t(Y_trunc[test_ix, ]), type = 'l', lty = 'solid', col = makeTransparent('black',40), lwd = 2,
main = 'Test',
ylab = 'NPP anomaly from 1940 - 1959 mean')
matlines(years_trunc, t(pc_km_npp_anom$Ypred_test), lty = 'solid', col = makeTransparent('red',40), lwd = 2)
legend('topleft', col = c('black', 'red'), legend = c('Observed', 'Predicted'), lty = 'solid')
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 10), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))
for(i in 1:length(test_ix)){
plot(years_trunc, (Y_trunc[test_ix, ])[i, ], ylim = c(-5, 20), type = 'n', axes = FALSE)
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "grey90", border = "grey90") # Color
lines(years_trunc, (Y_trunc[test_ix, ])[i, ], col = 'black', lwd = 1.5)
lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ], col = 'red', lwd = 1.5)
lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ] + 2*(pc_km_npp_anom$sd_test[i, ]), col = makeTransparent('red',100))
lines(years_trunc, pc_km_npp_anom$Ypred_test[i, ] - 2*(pc_km_npp_anom$sd_test[i, ]), col = makeTransparent('red',100))
}
reset()
mtext(text = 'NPP anomaly test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))
The second wave of JULES runs (wave01) shouldn’t have many problems with “bad” members, as most are drawn from (relatively) good parts of parameter space. Here we append wave01 to the filtered wave00 ensemble and refit.
keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15)
keep_ix <- setdiff(keep_ix, 288)
X_trunc <- X[keep_ix, ]
Y_trunc <- npp_ens_wave00[keep_ix, years_ix ]
train_ix = 1:300
test_ix = 301:374
train_ix_wave01 <- 1:300
test_ix_wave01 <- 301:400
X_trunc_append <- rbind(X_trunc, X_wave01_train)
Y_trunc_append <- rbind(Y_trunc,npp_ens_wave01[ , years_ix ] )
#X_trunc_train_append <- rbind(X_trunc[train_ix, ], X_wave01_train[train_ix_wave01, ])
#Y_trunc_train_append <- rbind(Y_trunc[train_ix, ],npp_ens_wave01[train_ix_wave01, years_ix ] )
#X_trunc_test_append <- rbind(X_trunc[test_ix, ], X_wave01_train[test_ix_wave01, ])
#Y_trunc_test_append <- rbind(Y_trunc[test_ix, ], npp_ens_wave01[test_ix_wave01, years_ix ] )
train_ix <- c(1:300, 375:674)
test_ix <- c(301:374, 675:774)
pc_km_npp_append <- pca_km(X = X_trunc_append,
Y_trunc_append,
train_ix = train_ix,
test_ix = test_ix,
npc = 3,
scale = TRUE,
center = TRUE)
plot(pc_km_npp_append$pca)
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 20), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))
for(i in 1:length(test_ix)){
plot(years_trunc, (Y_trunc_append[test_ix, ])[i, ], ylim = c(-10, 120), type = 'n', axes = FALSE)
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "grey90", border = "grey90") # Color
lines(years_trunc, (Y_trunc_append[test_ix, ])[i, ], col = 'black', lwd = 1.5)
lines(years_trunc, pc_km_npp_append$Ypred_test[i, ], col = 'red', lwd = 1.5)
lines(years_trunc, pc_km_npp_append$Ypred_test[i, ] + 2*(pc_km_npp_append$sd_test[i, ]), col = makeTransparent('red',100))
lines(years_trunc, pc_km_npp_append$Ypred_test[i, ] - 2*(pc_km_npp_append$sd_test[i, ]), col = makeTransparent('red',100))
}
reset()
mtext(text = 'NPP test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))
npp_anom_ens_wave01 <- anomalizeTSmatrix(npp_ens_wave01, 90:111)
keep_ix <- which(X[, 'f0_io'] < 0.9 & X[, 'b_wl_io'] > 0.15)
keep_ix <- setdiff(keep_ix, 288)
X_trunc <- X[keep_ix, ]
Y_trunc <- npp_anom_ens_wave00[keep_ix, years_ix ]
X_trunc_append <- rbind(X_trunc, X_wave01_train)
Y_trunc_append <- rbind(Y_trunc,npp_anom_ens_wave01[ , years_ix ] )
train_ix <- c(1:300, 375:674)
test_ix <- c(301:374, 675:774)
pc_km_npp_anom_append <- pca_km(X = X_trunc_append,
Y_trunc_append,
train_ix = train_ix,
test_ix = test_ix,
npc = 5,
scale = TRUE,
center = TRUE)
plot(pc_km_npp_anom_append$pca)
# seeing sparkline pairs of ensemble members would be good
par(mfrow = c(10, 20), mar = c(0.1, 0.1, 0.1, 0.1), oma = c(3,0.1,3,0.1))
for(i in 1:length(test_ix)){
plot(years_trunc, (Y_trunc_append[test_ix, ])[i, ], ylim = c(-2, 20), type = 'n', axes = FALSE)
rect(par("usr")[1], par("usr")[3],
par("usr")[2], par("usr")[4],
col = "grey90", border = "grey90") # Color
lines(years_trunc, (Y_trunc_append[test_ix, ])[i, ], col = 'black', lwd = 1.5)
lines(years_trunc, pc_km_npp_anom_append$Ypred_test[i, ], col = 'red', lwd = 1.5)
lines(years_trunc, pc_km_npp_anom_append$Ypred_test[i, ] + 2*(pc_km_npp_anom_append$sd_test[i, ]), col = makeTransparent('red',100))
lines(years_trunc, pc_km_npp_anom_append$Ypred_test[i, ] - 2*(pc_km_npp_anom_append$sd_test[i, ]), col = makeTransparent('red',100))
}
reset()
mtext(text = 'NPP test set predictions', side = 3, cex = 1.5, line = -2, outer = TRUE)
legend('bottom', col = c('black', 'red', makeTransparent('red',100)), legend = c('observed', 'predicted', '+- 2sd'), horiz = TRUE, lty = 'solid', lwd = c(1.5,1.5,1))
# Cumulative NBP timeseries for each ensemble member
cnbp_ens_wave00 <- t(apply(nbp_ens_wave00, 1, FUN = cumsum))
matplot(years, t(cnbp_ens_wave00[-288,]), col = 'black', type = 'l', lty = 'solid')