Commit d4be3808 authored by luroth's avatar luroth
Browse files

preparation for publication: minimal example added, code cleanup

parent 544d353a
# Working directory with temperature data
path_home <- './'
path_simulation <- './Simulation/Runs'
setwd(path_home)
# Libraries to use
library(readr)
library(tidyr)
library(purrr)
library(ggplot2)
library(lubridate)
library(gridExtra)
library(broom)
library(zoo)
library(plyr)
library(stringr)
library(SpATS)
library(dplyr)
# Models to load
source("R/Model/Spline_QMER.R")
source("R/Model/Dose_response.R")
source("R/Model/FitSpATS.R")
source("R/Model/FitREML.R")
source("R/Model/Graphs.R")
# Measurement error
sigma_error <- 10
# Plot for control overviews
plot_ids <- c(
"FIP20160002",
"FIP20170009",
"FIP20180013")
## Measurement frequencies
measurement_dates_freq_3d <- c(
format(seq.Date(from = as.Date("2020-03-01"), as.Date("2020-07-20"), by = 3), "%m%d")
)
##################################################################################################################
# Read and prepare data
##################################################################################################################
# Read designs and genotypes where simulated data are based on
df_designs <- read_csv('Simulation/designs.csv')
df_designs <- df_designs %>% mutate(plot.discrete_x = plot.row + if_else(plot.replication > 1, 25, 1),
plot.discrete_y = plot.range + if_else(plot.replication > 1, 22, 1))
df_genotypes <- read_csv('Simulation/genotypes.csv')
# Read truth (validation) data
df_genotypes_yearsite_true <- read_csv(paste0(path_simulation, "/1/genotype_yearsite_params.csv"))
df_genotypes_yearsite_true <- df_genotypes_yearsite_true %>%
pivot_longer(c(start_growth, stop_growth, final_height), names_to = "parameter")
df_genotypes_true <- read_csv(paste0(path_simulation, "/1/genotypes_params.csv"))
df_genotypes_true <- df_genotypes_true %>%
pivot_longer(c(start_growth, stop_growth, bplm_c0, bplm_cOpt, bplm_Asym), names_to = "parameter")
# Read simulated canopy height data
df_trait_values <- read_csv(paste0(path_simulation, "/1/trait_values.csv"), col_types = cols(
plot.UID = col_character(),
method.id = col_double(),
method.name = col_character(),
trait.id = col_double(),
trait.name = col_character(),
trait.label = col_character(),
responsible = col_character(),
timestamp = col_datetime(format = ""),
value = col_double()))
### Merge design and plot information
df_values_for_fit_orig <- inner_join(inner_join(df_trait_values, df_designs, by = "plot.UID"), df_genotypes, by = "genotype.id")
# Reduce data to 3-day measurement interval
df_values_for_fit <- df_values_for_fit_orig %>%
filter(format(timestamp, "%m%d") %in% measurement_dates_freq_3d)
# Add timepoint of preliminar measurement and value delta (growth) to each timepoint
df_values_for_fit <- df_values_for_fit %>%
group_by(plot.UID) %>%
arrange(timestamp) %>%
mutate(lag_timestamp = lag(timestamp), value_delta = value - lag(value))
# Remove first measurements, they do not suite as delta measurement
df_values_for_fit <- df_values_for_fit %>% filter(!is.na(lag_timestamp))
# Calcualte growth rate
df_values_for_fit <- df_values_for_fit %>%
group_by(plot.UID) %>%
mutate(time_diff = difftime(timestamp, lag_timestamp, units = "hours"))
##################################################################################################################
# Fit P-splines (Stage 1)
##################################################################################################################
### Fit p-splines (process may take some minutes!!!)
df_spline_model <- df_values_for_fit %>%
ungroup() %>%
group_by(plot.UID, year_site.UID) %>%
nest() %>%
mutate(spline_model = map(data,
~fit_scam_spline(.$timestamp, .$value)))
# Remove origin data from memory to save some space
df_spline_model <- df_spline_model %>% dplyr::select(-data)
# Predict value with spline model
# 2h interval
time_interval <- 60 * 60 * 12
# Time points in 2h interval
prediction_timepoints <- df_values_for_fit %>%
group_by(year_site.UID) %>%
filter(timestamp > min(timestamp) & timestamp < max(timestamp)) %>%
summarize(prediction_timepoint = list(seq(round_any(min(timestamp), time_interval, f = ceiling),
round_any(max(timestamp) - days(5), time_interval, f = floor),
time_interval)))
# Add design information to plot predictions
predictions <- inner_join(df_spline_model, prediction_timepoints, by = "year_site.UID")
# Predict
df_spline_predicts <- predictions %>%
ungroup() %>%
group_by(plot.UID) %>%
do(spline_predicts = tibble(
predict = predict_scam_spline(.$spline_model[[1]], list(x = .$prediction_timepoint[[1]])),
predict_se = predict_scam_spline(.$spline_model[[1]], list(x = .$prediction_timepoint[[1]]), se = T),
predict_deriv = predict_scam_spline(.$spline_model[[1]], list(x = .$prediction_timepoint[[1]]), deriv = T),
timestamp = .$prediction_timepoint[[1]]))
##################################################################################################################
# Find start/stop and maximum height (QMER method) (Stage 1)
##################################################################################################################
# Apply QMER
df_growth_phase_predicts <- df_spline_predicts %>%
ungroup() %>%
group_by(plot.UID) %>%
mutate(growth_phase_params = map(spline_predicts, find_start_stop_growth_phase))
# Extract predicted values
df_growth_phase_predicts <- df_growth_phase_predicts %>%
unnest(growth_phase_params)
#-----------------------------------------------------------------------------------------------------------------
# Plot results (Splines and QMER parameter for 4 sample plots)
#-----------------------------------------------------------------------------------------------------------------
measurements <- df_values_for_fit %>% filter(plot.UID %in% plot_ids)
predicts <- df_growth_phase_predicts %>%
filter(plot.UID %in% plot_ids) %>%
unnest(spline_predicts)
predicts <- inner_join(predicts, df_designs %>% dplyr::select(plot.UID, genotype.id, year_site.UID), by = "plot.UID")
predicts <- inner_join(predicts, df_genotypes, by = "genotype.id")
plot_heights <- ggplot(data = predicts) +
geom_point(data = measurements, aes(x = timestamp, y = value)) +
geom_line(aes(x = timestamp, y = predict), color = "red") +
geom_vline(aes(xintercept = predict_start_growth, linetype = "t_PH_start"), color = "darkgrey") +
geom_vline(aes(xintercept = predict_stop_growth, linetype = "t_PH_stop"), color = "darkgrey") +
geom_hline(aes(yintercept = predict_final_value, color = "final height")) +
facet_wrap(~year_site.UID, scales = "free_x", nrow = 1) +
scale_x_datetime(NULL, date_breaks = "2 months", date_labels = "%b") +
scale_y_continuous("Canopy height (mm)") +
scale_linetype(NULL, labels = c("t_PH_stop" = expression(paste("stop growth phase (", t[PH[stop]], ")")),
"t_PH_start" = expression(paste("start growth phase (", t[PH[start]], ")")))) +
scale_color_manual(NULL, values = c("final height" = "black"),
labels = c("final height" = expression(paste("final height (", PH[max], ")")))) +
theme +
theme(legend.position = "bottom")
plot(plot_heights)
#-----------------------------------------------------------------------------------------------------------------
# Prepare plot based data in long format, values and se's
df_growth_phase_predicts_stage1 <- df_growth_phase_predicts %>%
mutate(
predict_final_height = predict_final_value,
predict_final_height_se = predict_final_value_se,
predict_start_growth = yday(predict_start_growth),
predict_stop_growth = yday(predict_stop_growth)) %>%
dplyr::select(plot.UID,
predict_final_height, predict_final_height_se,
predict_start_growth, predict_start_growth_se,
predict_stop_growth, predict_stop_growth_se)
df_growth_phase_stage1_values <- df_growth_phase_predicts_stage1 %>%
dplyr::select(plot.UID, predict_start_growth, predict_stop_growth, predict_final_height) %>%
pivot_longer(
cols = c(predict_final_height, predict_start_growth, predict_stop_growth), names_to = "parameter", values_to = "predict")
df_growth_phase_stage1_se <- df_growth_phase_predicts_stage1 %>%
dplyr::select(plot.UID, predict_final_height_se,
predict_start_growth_se, predict_stop_growth_se) %>%
pivot_longer(
cols = c(
predict_start_growth_se, predict_stop_growth_se, predict_final_height_se), names_to = "parameter", values_to = "se") %>%
mutate(parameter = str_sub(parameter, 1, -4))
df_growth_phase_predicts_stage1 <- inner_join(df_growth_phase_stage1_values, df_growth_phase_stage1_se, by = c("plot.UID", "parameter"))
# Remove "prediction_" prefix from parameter names
df_growth_phase_predicts_stage1 <- df_growth_phase_predicts_stage1 %>% mutate(parameter = substr(parameter, 9, 40))
#-----------------------------------------------------------------------------------------------------------------
# Plot results (Extracted parameters versus simulation input)
#-----------------------------------------------------------------------------------------------------------------
df_for_plot <- inner_join(df_growth_phase_predicts_stage1, df_genotypes_yearsite_true)
df_for_plot <- df_for_plot %>% mutate(value_true = value, value = predict)
df_for_plot <- inner_join(df_for_plot, df_designs, by = c("plot.UID", "year_site.UID", "genotype.id"))
plot <- ggplot(data = df_for_plot,
aes(x = value, y = predict)) +
geom_point() +
facet_wrap(parameter ~ year_site.UID, scales = "free")
plot(plot)
#-----------------------------------------------------------------------------------------------------------------
##################################################################################################################
# Calculate adjusted genotype means per year-site (Stage 2, SpATS)
##################################################################################################################
df_all_params <- df_growth_phase_predicts_stage1 %>% mutate(value = predict)
df_all_params <- inner_join(df_all_params, df_designs, by = c("plot.UID", "year_site.UID", "genotype.id"))
df_params_yearsite_weighted <- df_all_params %>%
mutate(year_site.UID_ = year_site.UID) %>%
group_by(year_site.UID_, parameter) %>%
nest() %>%
mutate(BLUEs = map(data, fit_SpATS, parameter), use_weights = TRUE, use_checks = TRUE)
# Extract BLUEs
df_params_BLUES_yearsite_weighted <- df_params_yearsite_weighted %>%
select(-use_weights) %>%
unnest(BLUEs)
df_params_BLUES_yearsite_weighted <- df_params_BLUES_yearsite_weighted %>% select(-data)
##################################################################################################################
# Calculate overall adjusted genotype means per year-site (Stage 3, asreml-R)
##################################################################################################################
# Apply asreml-R
df_BLUEs_weights <- df_params_BLUES_yearsite_weighted %>%
mutate(year_site.UID = year_site.UID_) %>%
group_by(parameter) %>%
nest() %>%
mutate(BLUEs = map(data, fit_REML, parameter = parameter, use_weights = T)) %>%
select(-data) %>%
unnest(BLUEs) %>%
unnest(BLUE)
df_BLUEs_weights <- df_BLUEs_weights %>% rename(predict = BLUE)
#-----------------------------------------------------------------------------------------------------------------
# Plot results (Extracted parameters versus simulation input)
#-----------------------------------------------------------------------------------------------------------------
# Validate with true (simulation input) values
df_BLUES_validate <- inner_join(df_genotypes_true, df_BLUEs_weights, by = c("genotype.id", "parameter"))
plot <- ggplot(data = df_BLUES_validate, aes(value, predict)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
facet_wrap(~parameter, scales = "free")
plot(plot)
df_BLUES_validate %>%
group_by(parameter) %>%
summarise(corr = cor(value, predict))
#-----------------------------------------------------------------------------------------------------------------
\ No newline at end of file
......@@ -130,6 +130,14 @@ predict_scam_spline_posteriors <- function(spline, x_)
# For each realisation, obtain the "fitted" curve
predicts_ <- lp %*% t(sim)
# Plot for test graph
#p <- predict(spline, newdata = x_, se.fit = T)
# plot(x_$x, p$fit, "l", col="red")
# matlines(x_$x, predicts_, col = "black", lty = "solid")
# lines(x_$x, p$fit, col="red")
# lines(x_$x, p$fit + 1.96*p$se.fit, col = 2, lty = 2)
# lines(x_$x, p$fit - 1.96*p$se.fit, col = 2, lty = 2)
predicts <- lapply(seq_len(ncol(predicts_)), function(i) predicts_[,i])
predicts_r <- sapply (predicts, function (x) {length (x) <- nrow(predicts_); return (x)})
......@@ -250,9 +258,10 @@ find_start_stop_growth_phase_posterior <- function(df, text = NA, threshold_star
df_posterior <- mapply(function(X, Y) {tibble(predict = X, predict_deriv=Y, timestamp = df$timestamp, predict_se = df$predict_se)}, X=df_posterior_1, Y=df_posterior_2, SIMPLIFY = F)
predict_posterior <- lapply(df_posterior, find_start_stop_growth_phase, NA, threshold_start, threshold_stop, delta_days, final_height_agg)
df_predict_posterior <- bind_rows(predict_posterior)
df_predict_posterior_se <- df_predict_posterior %>% dplyr::select(-ends_with("se")) %>%
summarise_each(funs(mean,sd,sepost=sd(.)/sqrt(n()))) %>%
dplyr::select(ends_with("sepost"))
summarise(across(.fns=list(mean=mean,sd = sd,sepost= ~ sd(.)/sqrt(n())))) %>%
dplyr::select(ends_with("sepost"))
predict <- bind_cols(predict, df_predict_posterior_se)
print(text)
......
# Working directory with temperature data
path_home <- 'E:/Scripts/htfp_data_processing'
path_simulation <- 'E:/Simulation/Runs_ALL'
path_home <- '/home/luroth/PycharmProjects/htfp_data_processing'
path_simulation <- '/home/luroth/public/Evaluation/Projects/KP0012_luroth/9_phenomics_data_processing/Simulation/sim_runs_POWERPC/Runs_ALL'
setwd(path_home)
......@@ -18,7 +18,7 @@ library(fs)
library(dplyr)
library(multidplyr)
#library(multidplyr)
source("R/Model/Spline_QMER.R")
source("R/Model/Dose_response.R")
......
# Working directory with temperature data
path_home <- 'E:/Scripts/htfp_data_processing'
path_simulation <- 'E:/Simulation/Runs_ALL'
path_home <- '/home/luroth/PycharmProjects/htfp_data_processing'
path_simulation <- '/home/luroth/public/Evaluation/Projects/KP0012_luroth/9_phenomics_data_processing/Simulation/sim_runs_POWERPC/Runs_ALL'
setwd(path_home)
# Libraries to use
library(readr)
library(tidyr)
library(purrr)
......@@ -21,7 +22,7 @@ library(dplyr)
source("R/Model/FitREML.R")
source("R/Model/Graphs.R")
start_run <- 473
start_run <- 22
max_runs <- 500
#500
......@@ -192,7 +193,7 @@ for(run in start_run:max_runs) {
weights = weights_,
na.action = list(y = "include", x = "omit"),
maxit = maxiter,
workspace = "20gb"
workspace = "10gb"
#fail="soft"
)
summary(fit_BLUE)
......@@ -201,6 +202,7 @@ for(run in start_run:max_runs) {
names(df_BLUE_) <- c("genotype.id", "BLUE")
})
if(is.null(fit_BLUE)) {
gc()
df_BLUE_ <- data.frame(genotype.id = df$genotype.id %>% unique(), BLUE =NA)
}
......@@ -219,7 +221,7 @@ for(run in start_run:max_runs) {
weights = weights_post,
na.action = list(y = "include", x = "omit"),
maxit = maxiter,
workspace = "20gb"
workspace = "10gb"
#fail="soft"
)
summary(fit_BLUE)
......@@ -228,6 +230,7 @@ for(run in start_run:max_runs) {
names(df_BLUE_) <- c("genotype.id", "BLUE")
})
if(is.null(fit_BLUE)) {
gc()
df_BLUE_ <- data.frame(genotype.id = df$genotype.id %>% unique(), BLUE =NA)
}
......@@ -245,7 +248,7 @@ for(run in start_run:max_runs) {
residual = ~dsum(~units | year_site.UID),
na.action = list(y = "include", x = "omit"),
maxit = maxiter,
workspace = "20gb"
workspace = "10gb"
#fail="soft"
)
summary(fit_BLUE)
......@@ -254,6 +257,7 @@ for(run in start_run:max_runs) {
names(df_BLUE_) <- c("genotype.id", "BLUE")
})
if(is.null(fit_BLUE)) {
gc()
df_BLUE_ <- data.frame(genotype.id = df$genotype.id %>% unique(), BLUE =NA)
}
......
# Working directory with temperature data
path_home <- 'E:/Scripts/htfp_data_processing'
path_simulation <- 'E:/Simulation/Runs_ALL'
path_home <- '/home/luroth/PycharmProjects/htfp_data_processing'
path_simulation <- '/home/luroth/public/Evaluation/Projects/KP0012_luroth/9_phenomics_data_processing/Simulation/sim_runs_POWERPC/Runs_ALL'
setwd(path_home)
......
# Working directory with temperature data
path_home <- 'C:/Users/luroth/PycharmProjects/htfp_wheat_canopy_height_processing'
path_simulation <- 'E:/Simulation/Runs'
path_home <- '/home/luroth/PycharmProjects/htfp_data_processing'
path_simulation <- '/home/luroth/public/Evaluation/Projects/KP0012_luroth/9_phenomics_data_processing/Simulation/sim_runs_POWERPC/Runs_ALL'
setwd(path_home)
# Libraries to use
......
......@@ -7,6 +7,11 @@ Based on manuscript:
Roth et al. 2021 - Phenomics data processing: A plot-level model for repeated measurements to extract the timing of key stages and quantities at defined time points
(https://doi.org/10.1101/2021.05.02.442243)
## Demo
For a running minimal example see:
```Demo/1_Pspline_QMER_SpATS_asreml.R```
## Stage 1
P-spline and QMER method functions:
......@@ -22,5 +27,10 @@ asreml-R functions:
# HTFP data processing II:
Based on manuscript:
Roth et al. 2021b - Phenomics data processing: Phenomics data processing: Extracting temperature dose-response curves from repeated measurements
(https://doi.org/10.1101/2021.07.23.453040 )
## Dose-Respone curve extraction
```R/Model/Dose_response.R```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment