library(SmoothPLS)
library(pls)
#> Warning: le package 'pls' a été compilé avec la version R 4.4.3
#>
#> Attachement du package : 'pls'
#> L'objet suivant est masqué depuis 'package:stats':
#>
#> loadingsThis notebook show the Smooth PLS algorithm for a multi-states Categorical Functional Data (MS-CFD).
N_states = 3
nind = 100 # number of individuals (train set)
start = 0 # First time
end = 100 # end time
curve_type = 'cat'
TTRatio = 0.2 # Train Test Ratio means we have floor(nind*TTRatio/(1-TTRatio))
NotS_ratio = 0.2 # noise variance over total variance for Y
beta_0_real=65.4321 # Intercept value for the link between X(t) and Y
nbasis = 10 # number of basis functions
norder = 4 # 4 for cubic splines basis
regul_time = seq(start, end, 5) # regularisation time sequence
regul_time_0 = seq(start, end, 1)
int_mode = 1 # in case of integration errors.All the states will share the same basis.
basis = create_bspline_basis(start, end, nbasis, norder)
#basis = fda::create.fourier.basis(c(start,end), nbasis = nbasis)
# All the states will share the same basis.
basis_list = obj_list_creation(N_rep = N_states, obj = basis)
plot(basis, main=paste0(nbasis, " ", basis$type," functions basis"))We have to prepare the data before the FPLS method. For each state, we build its indicator function.
df_processed = cat_data_to_indicator(df_cfd)
length(df_processed)
#> [1] 3
names(df_processed)
#> [1] "state_1" "state_2" "state_3"head(df_processed$state_3, 20)
#> id time state
#> 1 1 0.0000000 1
#> 2 1 0.4182081 0
#> 3 1 1.5448560 1
#> 4 1 5.7064580 0
#> 6 1 44.2917255 1
#> 7 1 49.3986723 0
#> 12 1 94.5885722 1
#> 13 1 100.0000000 1
#> 14 2 0.0000000 0
#> 16 2 23.5972614 1
#> 17 2 35.2188747 0
#> 19 2 62.3399729 1
#> 20 2 69.0495525 0
#> 23 2 100.0000000 0
#> 24 3 0.0000000 1
#> 25 3 7.7964939 0
#> 27 3 15.5383881 1
#> 28 3 26.7390645 0
#> 29 3 38.7901396 1
#> 30 3 60.9841073 0for(i in 1:length(beta_func_list)){
plot(0:end, beta_func_list[[i]](0:end, end_time = end),
ylab=paste0("Beta(t) n°=", i), type = 'l')
title(paste0("Beta(t) n°=", i))
}Y generation is based on the following equation : $Y = 0 + {i=1}^K _0^T _i(t) ind_i(t) dt $ with \(ind_i(t) = \{0, 1\}_{t \in [0, T]}\) the indicator function of the state \(i\).
We link \(\beta_i\) with the \(state_i\).
Y_df = generate_Y_df(df = df_processed, curve_type = curve_type,
beta_real_func_or_list = beta_func_list,
beta_0_real = beta_0_real,
NotS_ratio = NotS_ratio)
Y = Y_df$Y_noised
names(Y_df)
#> [1] "id" "Y_beta1" "Y_beta2" "Y_beta3" "Y_real" "Y_noised"head(Y_df)
#> id Y_beta1 Y_beta2 Y_beta3 Y_real Y_noised
#> 1 1 65.806580 -8.252748 -7.4320011 115.55393 117.36995
#> 2 2 26.027793 3.651140 0.2430665 95.35410 98.68405
#> 3 3 18.239666 -6.248977 -19.0326323 58.39016 102.56355
#> 4 4 5.205643 22.113474 -19.5805494 73.17067 85.04207
#> 5 5 -12.639639 38.985054 -9.9986223 81.77889 49.19584
#> 6 6 -13.391616 -20.803539 -20.0241998 11.21275 -6.47791naivePLS_obj = naivePLS(df_list = df_cfd, Y = Y, regul_time_obj = regul_time,
curve_type_obj = 'cat',
id_col_obj = 'id', time_col_obj = 'time',
print_steps = TRUE, plot_rmsep = TRUE,
print_nbComp = TRUE,plot_reg_curves = TRUE)
#> ### Naive PLS ###
#> => Input format assertions.
#> => Input format assertions OK.
#> => Data formatting.
#> => PLS model.#> [1] "Optimal number of PLS components : 2"
fpls_obj = funcPLS(df_list = df_cfd, Y = Y_df$Y_noised,
basis_obj = basis,
curve_type_obj = 'cat',
regul_time_obj = regul_time,
id_col_obj = 'id', time_col_obj = 'time',
print_steps = TRUE, plot_rmsep = TRUE,
print_nbComp = TRUE, plot_reg_curves = TRUE)
#> ### Functional PLS ###
#> => Input format assertions.
#> => Input format assertions OK.
#> => Building alpha matrix.
#> => Building curve names.
#> ==> Evaluating alpha for : CatFD_1_1.
#> ==> Evaluating alpha for : CatFD_1_2.
#> ==> Evaluating alpha for : CatFD_1_3.
#> => Evaluate metrix and root_metric.
#> => plsr(Y ~ trans_alphas).#> Optimal number of PLS components : 2 .
#> => Build Intercept and regression curves for optimal number of components.
#> ==> Build regression curve for : CatFD_1_1
#> ==> Build regression curve for : CatFD_1_2
#> ==> Build regression curve for : CatFD_1_3
spls_obj = smoothPLS(df_list = df_cfd, Y = Y_df$Y_noised, basis_obj = basis,
orth_obj = TRUE, curve_type_obj = 'cat',
int_mode = 1,
id_col_obj ='id', time_col_obj = 'time',
print_steps = TRUE, plot_rmsep = TRUE,
print_nbComp = TRUE, plot_reg_curves = TRUE)
#> ### Smooth PLS ###
#> ## Use parallelization in case of heavy computational load. ##
#> ## Threshold can be manualy adjusted : (default 2500) ##
#> ## >options(SmoothPLS.parallel_threshold = 500) ##
#> => Input format assertions.
#> => Input format assertions OK.
#> => Orthonormalize basis.
#> => Data objects formatting.
#> => Evaluate Lambda matrix.
#> ==> Lambda for : CatFD_1_state_1.
#> ==> Lambda for : CatFD_1_state_2.
#> ==> Lambda for : CatFD_1_state_3.
#> => PLSR model.
#> => Optimal number of PLS components : 2#> => Evaluate SmoothPLS functions and <w_i, p_j> coef.
#> => Build regression functions and intercept.
#> ==> Build regression curve for : CatFD_1_state_1
#> ==> Build regression curve for : CatFD_1_state_2
#> ==> Build regression curve for : CatFD_1_state_3
# Warning ms_spls_obj$delta_ms_list[[1]] is the intercept!
cat("curve_1 : smooth PLS regression curve.\n")
#> curve_1 : smooth PLS regression curve.
cat("curve_2 : functional PLS regression curve.\n")
#> curve_2 : functional PLS regression curve.
cat("curve_3 : naive PLS regression coefficients\n")
#> curve_3 : naive PLS regression coefficients
for(i in 1:N_states){
start = 0
print(paste0("State_", (i)))
evaluate_curves_distances(real_f = beta_func_list[[i]],
regul_time = regul_time,
fun_fd_list = list(spls_obj$reg_obj[[i+1]],
fpls_obj$reg_obj[[i+1]],
approxfun(
x = regul_time,
y = naivePLS_obj$opti_reg_coef[
start:(start+length(regul_time)
)])
)
)
start = start + length(regul_time)
}
#> [1] "State_1"
#> [1] "real_f -> curve_1 / INPROD : 47.4572000558795 / DIST : 13.2905098633588"
#> [1] "real_f -> curve_2 / INPROD : 43.6591352214833 / DIST : 13.6320162598076"
#> [1] "real_f -> curve_3 / INPROD : 13.4690073074092 / DIST : 27.8941820901434"
#> [1] "State_2"
#> [1] "real_f -> curve_1 / INPROD : 22.9684162701822 / DIST : 14.1561671305604"
#> [1] "real_f -> curve_2 / INPROD : 25.7051668331251 / DIST : 15.742963397086"
#> [1] "real_f -> curve_3 / INPROD : 31.4665194389073 / DIST : 38.1269198992419"
#> [1] "State_3"
#> [1] "real_f -> curve_1 / INPROD : 25.9590236019394 / DIST : 15.1535445344829"
#> [1] "real_f -> curve_2 / INPROD : 27.0203400870638 / DIST : 11.7454038364192"
#> [1] "real_f -> curve_3 / INPROD : -64.0495576976778 / DIST : 37.3104708298011"for(i in 1:N_states){
start = 0
y_lim = eval_max_min_y(f_list = list(spls_obj$reg_ob[[i+1]],
fpls_obj$reg_ob[[i+1]],
approxfun(
x = regul_time,
y = naivePLS_obj$opti_reg_coef[
start:(start+length(regul_time))]),
beta_func_list[[i]]
),
regul_time = regul_time_0)
plot(regul_time_0, beta_func_list[[i]](regul_time_0), col = 'black',
ylim = y_lim, xlab = 'Time', ylab = 'Value', type = 'l')
lines(regul_time_0, approxfun(x = regul_time,
y = naivePLS_obj$opti_reg_coef[
start:(start+
length(regul_time))])(regul_time_0),
col = 'green')
title(paste0(names(spls_obj$reg_obj)[i+1], " regression curves"))
plot(spls_obj$reg_obj[[i+1]], col = 'blue', add = TRUE)
plot(fpls_obj$reg_obj[[i+1]], col = 'red', add = TRUE)
legend("topleft",
legend = c("Real curve", "NaivePLS coef",
"SmoothPLS reg curve", "FunctionalPLS reg curve"),
col = c("black", "green", "blue", "red"),
lty = 1,
lwd = 1)
start = start + length(regul_time)
}train_results = data.frame(matrix(ncol = 5, nrow = 3))
colnames(train_results) = c("PRESS", "RMSE", "MAE", "R2", "var_Y")
rownames(train_results) = c("NaivePLS", "FPLS", "SmoothPLS")
test_results = train_resultsY_train = Y_df$Y_noised
# Naive
Y_hat = predict(naivePLS_obj$plsr_model,
ncomp = naivePLS_obj$nbCP_opti,
newdata = naivePLS_obj$plsr_model$model$`as.matrix(df_mod_wide)`)
train_results["NaivePLS", ] = evaluate_results(Y_train, Y_hat)
# FPLS
Y_hat_fpls = (predict(fpls_obj$plsr_model, ncomp = fpls_obj$nbCP_opti,
newdata = fpls_obj$trans_alphas)
+ fpls_obj$reg_obj$Intercept
+ mean(Y))
Y_hat_fpls = smoothPLS_predict(df_predict_list = df_cfd,
delta_list = fpls_obj$reg_obj,
curve_type_obj = curve_type,
int_mode = int_mode,
regul_time_obj = regul_time)
train_results["FPLS", ] = evaluate_results(Y_train, Y_hat_fpls)
# Smooth PLS
Y_hat_spls = smoothPLS_predict(df_predict_list = df_cfd,
delta_list = spls_obj$reg_obj,
curve_type_obj = curve_type,
int_mode = int_mode,
regul_time_obj = regul_time)
train_results["SmoothPLS", ] = evaluate_results(Y_train, Y_hat_spls)
train_results["NaivePLS", "nb_cp"] = naivePLS_obj$nbCP_opti
train_results["FPLS", "nb_cp"] = fpls_obj$nbCP_opti
train_results["SmoothPLS", "nb_cp"] = spls_obj$nbCP_optiY_test = Y_df_test$Y_noised
# Naive
df_test_wide = naivePLS_formatting(df_list = df_test,
regul_time_obj = regul_time,
curve_type_obj = curve_type,
id_col_obj = 'id', time_col_obj = 'time')
Y_hat = predict(naivePLS_obj$plsr_model,
ncomp = naivePLS_obj$nbCP_opti,
newdata = as.matrix(df_test_wide))
test_results["NaivePLS", ] = evaluate_results(Y_test, Y_hat)
# FPLS
Y_hat_fpls = smoothPLS_predict(df_predict_list = df_test,
delta_list = fpls_obj$reg_obj,
curve_type_obj = curve_type,
int_mode = int_mode,
regul_time_obj = regul_time)
test_results["FPLS", ] = evaluate_results(Y_test, Y_hat_fpls)
# Smooth PLS
Y_hat_spls = smoothPLS_predict(df_predict_list = df_test,
delta_list = spls_obj$reg_obj,
curve_type_obj = curve_type,
int_mode = int_mode,
regul_time_obj = regul_time)
test_results["SmoothPLS", ] = evaluate_results(Y_test, Y_hat_spls)
test_results["NaivePLS", "nb_cp"] = naivePLS_obj$nbCP_opti
test_results["FPLS", "nb_cp"] = fpls_obj$nbCP_opti
test_results["SmoothPLS", "nb_cp"] = spls_obj$nbCP_optitrain_results
#> PRESS RMSE MAE R2 var_Y nb_cp
#> NaivePLS 56851.26 23.84350 18.74728 0.8052915 80.52915 2
#> FPLS 51006.48 22.58462 17.48461 0.8253091 77.95544 2
#> SmoothPLS 54449.52 23.33442 18.54678 0.8135171 111.55514 2
test_results
#> PRESS RMSE MAE R2 var_Y nb_cp
#> NaivePLS 12668.95 22.51128 17.71806 0.7918992 96.46796 2
#> FPLS 13999.53 23.66392 17.75687 0.7700429 91.67738 2
#> SmoothPLS 17130.76 26.17691 20.60632 0.7186092 135.21697 2