Abstract
This document aims at simulating data with a DBV-SBV model for varietal mixtures (microplot-based design), fitting it, and checking the estimates.Dependencies:
suppressPackageStartupMessages(library(plantmix))
suppressPackageStartupMessages(library(ggplot2))
set.seed(12345)
nbGenos <- 200
levGenos <- sprintf(fmt=paste0("g%0", floor(log10(nbGenos))+1, "i"),
1:nbGenos)
nbSnps <- 1000
levSnps <- sprintf(fmt=paste0("s%0", floor(log10(nbSnps))+1, "i"),
1:nbSnps)
nb_pops <- 10
weak_div_pops <- diag(nb_pops)
weak_div_pops[upper.tri(weak_div_pops)] <- 0.9
weak_div_pops[lower.tri(weak_div_pops)] <- weak_div_pops[upper.tri(weak_div_pops)]
tmp <- rep(nbGenos / nb_pops, nb_pops - 1)
tmp <- c(tmp, nbGenos - sum(tmp))
snpGenos <- simulGenosDoseStruct(nb_genos=tmp,
nb_snps=nbSnps,
div_pops=weak_div_pops,
geno_IDs=levGenos,
snp_IDs=levSnps)
dim(snpGenos)
## [1] 200 1000
snpGenos[1:3,1:4]
## s0001 s0002 s0003 s0004
## g001 1 1 1 1
## g002 1 0 1 1
## g003 0 1 2 0
table(snpGenos)
## snpGenos
## 0 1 2
## 61799 80252 57949
For simplicity, the first estimator of VanRaden (2008) is used, that assumes linkage equilibrium and Hardy-Weinberg equilibrium.
snpAFs <- colSums(snpGenos) / (2 * nrow(snpGenos))
GRM <- estimGRM(snpGenos, snpAFs)
GRM <- as.matrix(Matrix::nearPD(GRM)$mat)
image(Matrix(GRM), main="GRM")
hist(diag(GRM), main="GRM")
hist(GRM[upper.tri(GRM)], main="GRM")
In this vignette, binary varietal mixtures (of equal proportions) are assembled from this panel of 200 genotypes. Only 1000 mixtures are observed, among all 1.99^{4} possible ones, i.e., approximately 5%.
The incomplete (sparse) design is optimized so that it is balanced: all genotype are observed in the same number of mixtures.
The field trial contains these mixtures as well as monovarietals, and is organized as a randomized complete block design.
The yield data are generated according to the following equations, using the vec operator:
varietal mixtures: \(vec(Y_{mix}) = X_{mix} \, \boldsymbol{\beta}_{mix} + Z_{DS,mix} \, \boldsymbol{vec(BV_{DS})} + Z_{D{\times}S} \, \boldsymbol{vec(DBV{\times}SBV)} + \boldsymbol{\epsilon}_{mix}\)
monovarietals (“pure” stands): not (yet) simulated
Notations:
\(\boldsymbol{y}_{mix} := vec(Y_{mix})\) with \(Y_{mix}\) being the yield matrix with one row per microplot and two columns, one per genotype in the mixtures
\(\boldsymbol{\beta}_{mix}\): vector of fixed effects (blocks)
\(BV_{DS} \sim \mathcal{N}(O, K, \Sigma_{DS})\)
\(BV_{DS}\): matrix whose first column contains the DBVs and second column the SBVs
\(O\): matrix of zeroes of the same dimensions as \(BV_{DS}\)
\(K\): genetic similarity (kinship) matrix between genotypes
\(\Sigma_{DS}\): genetic covariance matrix between DBVs and SBVs
\(DBV{\times}SBV \sim \mathcal{N}(O, \; \sigma_{D{\times}S} \, K, \; \sigma_{D{\times}S} \, K)\)
\(DBV{\times}SBV\): matrix of DBV-SBV interactions, with focals in rows and neighbors in columns
\(\sigma_{D{\times}S}^2\): variance component of DBV-SBV interactions
\(\boldsymbol{\epsilon}_{mix} := vec(E_{mix})\) where \(E_{mix} \sim \mathcal{N}(O, \text{Id}, \Sigma_E)\)
\(O\): matrix of zeroes of the same dimensions as \(E\)
\(\Sigma_E\): covariance matrix between errors within the same microplot
Assemble the mixtures by optimizing the design, and generate the phenotypes (for the moment, K is ignored when drawing DBVxSBV, and hence will be ignored in the inference, too, see below):
out <- simulDBVSBVintraMicroplots(GRM, nbMixes = 1000)
names(out)
## [1] "design" "truth" "datL" "datW"
tmp <- list2env(out, envir = environment())
Plot the design:
design <- out$design
plotDesignVarMix(design$graph, levGenos, subplots = "diallel")
To check that the design is balanced, here is how to quickly check the number of mixtures in which each genotype is present:
head(design$combs)
## comp1 comp2
## 1 g004 g006
## 2 g006 g007
## 3 g006 g008
## 4 g003 g009
## 5 g004 g011
## 6 g009 g013
mixtures <- getMixtureList(design$combs)
str(mixtures, list.len=6)
## List of 1000
## $ 1 : chr [1:2] "g004" "g006"
## $ 2 : chr [1:2] "g006" "g007"
## $ 3 : chr [1:2] "g006" "g008"
## $ 4 : chr [1:2] "g003" "g009"
## $ 5 : chr [1:2] "g004" "g011"
## $ 6 : chr [1:2] "g009" "g013"
## [list output truncated]
geno2mixes <- getMixturesPerGeno(mixtures)
geno2mixes[1:3]
## $g001
## [1] "22" "23" "76" "79" "127" "325" "602" "853" "993" "996"
##
## $g002
## [1] "26" "96" "294" "730" "873" "879" "890" "913" "950" "992"
##
## $g003
## [1] "4" "40" "103" "194" "226" "277" "363" "914" "949" "998"
table(sapply(geno2mixes, length))
##
## 10
## 200
str(datW)
## 'data.frame': 2400 obs. of 15 variables:
## $ ID : Factor w/ 1200 levels "g001","g001+g029",..: 35 55 56 25 36 85 65 66 48 86 ...
## $ geno_1 : Factor w/ 200 levels "g001","g002",..: 4 6 6 3 4 9 7 7 5 9 ...
## $ geno_2 : Factor w/ 192 levels "g005","g006",..: 2 3 4 5 6 7 8 10 11 12 ...
## $ props : chr "0.5+0.5" "0.5+0.5" "0.5+0.5" "0.5+0.5" ...
## $ stand : Factor w/ 2 levels "mono","mix": 2 2 2 2 2 2 2 2 2 2 ...
## $ block : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
## $ true_fix_yield_1: num 36.3 36.3 36.3 36.3 36.3 ...
## $ true_fix_yield_2: num 36.3 36.3 36.3 36.3 36.3 ...
## $ true_gen_yield_1: num 0.21 -1.709 0.206 -4.118 -1.517 ...
## $ true_gen_yield_2: num 0.564 2.031 3.386 4.401 2.916 ...
## $ true_yield_1 : num 36.5 34.6 36.5 32.1 34.7 ...
## $ true_yield_2 : num 36.8 38.3 39.6 40.7 39.2 ...
## $ yield_1 : num 34.9 39.8 39.9 33.9 33.8 ...
## $ yield_2 : num 38.2 36 35.8 48 38.8 ...
## $ yield : num 73.1 75.9 75.7 81.9 72.7 ...
table(datW$stand)
##
## mono mix
## 400 2000
str(datL)
## 'data.frame': 4400 obs. of 14 variables:
## $ focal : Factor w/ 200 levels "g001","g002",..: 4 6 6 7 6 8 3 9 4 11 ...
## $ neighbor : Factor w/ 200 levels "g001","g002",..: 6 4 7 6 8 6 9 3 11 4 ...
## $ ID : Factor w/ 1200 levels "g001","g001+g029",..: 35 35 55 55 56 56 25 25 36 36 ...
## $ geno_1 : Factor w/ 200 levels "g001","g002",..: 4 4 6 6 6 6 3 3 4 4 ...
## $ geno_2 : Factor w/ 192 levels "g005","g006",..: 2 2 3 3 4 4 5 5 6 6 ...
## $ props : chr "0.5+0.5" "0.5+0.5" "0.5+0.5" "0.5+0.5" ...
## $ stand : Factor w/ 2 levels "mono","mix": 2 2 2 2 2 2 2 2 2 2 ...
## $ block : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
## $ propF : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ propN : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ true_fix_yield: num 36.3 36.3 36.3 36.3 36.3 ...
## $ true_gen_yield: num 0.21 0.564 -1.709 2.031 0.206 ...
## $ true_yield : num 36.5 36.8 34.6 38.3 36.5 ...
## $ yield : num 34.9 38.2 39.8 36 39.9 ...
head(table(datL$focal, datL$stand, datL$block))
## , , = A
##
##
## mono mix
## g001 1 10
## g002 1 10
## g003 1 10
## g004 1 10
## g005 1 10
## g006 1 10
##
## , , = B
##
##
## mono mix
## g001 1 10
## g002 1 10
## g003 1 10
## g004 1 10
## g005 1 10
## g006 1 10
pos <- position_dodge(1)
ggplot(datL) +
aes(x = stand, y = yield) +
geom_violin(aes(fill = block), trim = FALSE, position = pos) +
geom_boxplot(aes(group = interaction(stand,block)), width = 0.3,
position = pos) +
theme_bw()
Compute relative yields of mixtures (RYMs):
RYMs <- list()
for(block in levels(datW$block)){
RYMs[[block]] <- RYM(droplevels(datW[datW$block == block,]),
colIDcomps = "ID", sep = "+",
colProps = "props", colY = "yield")
}
RYMs <- do.call(rbind, RYMs)
RYMs <- droplevels(RYMs[RYMs$stand == "mix",])
Plot RYMs per block:
ggplot(RYMs) +
aes(x = block, y = RYM) +
geom_hline(yintercept = 1, col = "darkgrey") +
geom_violin(aes(fill = block), trim = FALSE) +
geom_boxplot(width = 0.3) +
theme_bw()
Plot RYMs per mixture:
tmp <- tapply(RYMs$RYM, RYMs$ID, mean)
RYMs$ID <- factor(as.character(RYMs$ID),
levels = names(sort(tmp)))
idx <- seq(1, nlevels(RYMs$ID), by = 10)
ggplot(RYMs) +
aes(x = ID, y = RYM) +
labs(x = paste0(nlevels(RYMs$ID), " mixtures")) +
geom_hline(yintercept = 1, col = "darkgrey") +
geom_point(aes(col = block)) +
geom_point(data = data.frame(ID = names(sort(tmp)),
RYM = sort(tmp)),
col = "black", size = 3) +
guides(x = guide_axis(angle = 45)) +
## scale_x_discrete(breaks = levels(RYMs$ID)[idx]) +
theme_bw() +
theme(
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
Compute relative yields (RYs):
RYs <- list()
for(block in levels(datL$block)){
RYs[[block]] <- RY(droplevels(datL[datL$block == block,]),
colIDstand = "ID", colIDfocal = "focal",
colProp = "propF", colY = "yield")
}
RYs <- do.call(rbind, RYs)
RYs <- droplevels(RYs[RYs$stand == "mix",])
Plot RYs per block:
ggplot(RYs) +
aes(x = block, y = RY) +
geom_hline(yintercept = 1/2, col = "darkgrey") +
geom_violin(aes(fill = block), trim = FALSE) +
geom_boxplot(width = 0.3) +
theme_bw()
Plot RYs per focal genotype:
tmp <- tapply(RYs$RY, RYs$focal, mean)
RYs$focal <- factor(as.character(RYs$focal),
levels = names(sort(tmp)))
idx <- seq(1, nlevels(RYs$focal), by = 5)
ggplot(RYs) +
aes(x = focal, y = RY) +
labs(x = paste0(nlevels(RYs$focal), " focal genotypes")) +
geom_hline(yintercept = 1/2, col = "darkgrey") +
geom_point(aes(col = block)) +
geom_point(data = data.frame(focal = names(sort(tmp)),
RY = sort(tmp)),
col = "black", size = 3) +
guides(x = guide_axis(angle = 45)) +
scale_x_discrete(breaks = levels(RYs$focal)[idx]) +
theme_bw()
idx_mix <- which(datL$stand == "mix")
datLmix <- droplevels(datL[idx_mix,])
listY <- list(y = datLmix$yield)
X <- model.matrix(~ 1 + block, data = datLmix, contrasts = list(block = "contr.sum"))
listZ <- list()
propNs <- as.list(datLmix$propN)
tmp <- mkZintraspe(datLmix,
colIDfocal = "focal", colIDneighbors = "neighbor", sep = "+",
colPfocal = "propF", prop_neighbors = propNs,
genos = levels(datLmix$focal)
)
listZ$Z_DS <- cbind(tmp$Z_D, tmp$Z_S)
props <- Map(function(i) {
setNames(
unlist(datLmix[i, c("propF", "propN")]),
as.character(unlist(datLmix[i, c("focal", "neighbor")]))
)
}, 1:nrow(datLmix))
Z_DxS <- mkZintraspe_DxS(datLmix, props,
colIDfocal = "focal", colIDneighbors = "neighbor", sep = "+",
skipUnusedCols = TRUE
)
## listZ$Z_DxS <- Z_DxS
listVCov <- list(K = GRM[levels(datL$focal), levels(datL$focal)])
system.time(
fit <- fitDBVSBVintra(listY = listY,
X = X,
listZ = listZ,
listVCov = listVCov,
lOptions = list(iter.max = 20),
REML = TRUE, verbose = FALSE))
## user system elapsed
## 19.317 0.044 19.367
ggplot(fit$trace) + aes(x = iter, y = objfn) +
geom_point() + geom_line() + labs(title = "Optimization convergence") +
theme_bw()
checks <- data.frame(
param = rownames(fit$beta),
truth = truth$blTrueContrs_mix,
fit$beta,
row.names = NULL,
check.names = FALSE
)
checks$nBE <- normBiasError(checks$Estimate, checks$truth)
print(checks)
## param truth Estimate Std. Error z value Pr(>|z^2|) nBE
## 1 (Intercept) 35.000000 35.430253 0.0522177 678.51039 0.000000e+00 1.2292945
## 2 block1 1.259489 1.248004 0.0522177 23.90001 3.061876e-126 -0.9118796
if ("var_DxS" %in% names(fit)) {
checks <- data.frame(
param = c(rownames(fit$vcov_BV), "var_DxS"),
truth = c(
truth$var_DBV,
truth$var_SBV,
truth$cor_DBV_SBV,
truth$var_DBVxSBV
),
rbind(fit$vcov_BV, fit$var_DxS),
row.names = NULL,
check.names = FALSE
)
} else {
checks <- data.frame(
param = rownames(fit$vcov_BV),
truth = c(
truth$var_DBV,
truth$var_SBV,
truth$cor_DBV_SBV
),
fit$vcov_BV,
row.names = NULL,
check.names = FALSE
)
}
checks$nBE <- normBiasError(checks$Estimate, checks$truth)
print(checks)
## param truth Estimate Std. Error z value Pr(>|z^2|) nBE
## 1 var_DBV 31.360 26.8115640 2.87501958 9.325698 1.102556e-20 -14.503941
## 2 var_SBV 6.272 6.2005894 0.90959599 6.816861 9.305127e-12 -1.138562
## 3 cor_BV -0.800 -0.7341821 0.04967031 -14.781106 1.939538e-49 -8.227237
checks <- data.frame(
param = c("var_err", "cor_err"),
truth = c(truth$var_err,
truth$cor_err),
rbind(fit$var_err,
ifelse("Cor_E" %in% names(fit),
fit$Cor_E[2,], rep(NA, 4))),
row.names = NULL,
check.names = FALSE
)
checks$nBE <- normBiasError(checks$Estimate, checks$truth)
print(checks)
## param truth Estimate Std. Error z value Pr(>|z^2|) nBE
## 1 var_err 13.02224 13.0842019 0.3132472 41.7695763 0.0000000 0.4758338
## 2 cor_err -0.20000 -0.1664183 -0.1664183 -0.1664183 -0.1664183 -16.7908621
if ("var_DxS" %in% names(fit)) {
checks <- data.frame(
param = c(
rep(c("DBV","SBV"), each = nrow(truth$BV)),
rep("DBVxSBV", ncol(listZ$Z_DxS))
),
truth = c(
truth$BV[levels(datLmix$focal),],
truth$vecDBVxSBV[colnames(listZ$Z_DxS)]
),
rbind(
fit$DBV,
fit$SBV,
fit$DBVxSBV
),
row.names = NULL,
check.names = FALSE
)
checks$param <- factor(checks$param,
levels = c("DBV", "SBV", "DBVxSBV")
)
} else {
checks <- data.frame(
param = rep(c("DBV","SBV"), each = nrow(truth$BV)),
truth = c(truth$BV[levels(datLmix$focal),]),
rbind(fit$DBV, fit$SBV),
row.names = NULL,
check.names = FALSE
)
checks$param <- factor(checks$param,
levels = c("DBV", "SBV"))
}
checks$nBE <- normBiasError(checks$Estimate, checks$truth)
print(tapply(1:nrow(checks), checks$param, function(idx){
cor(checks$truth[idx], checks$Estimate[idx])
}))
## DBV SBV
## 0.9671248 0.8397493
ggplot(checks) +
aes(x = Estimate, y = truth) +
geom_hline(yintercept = 0, linetype = "dotted") +
geom_vline(xintercept = 0, linetype = "dotted") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted") +
geom_point() +
labs(
title = "Results with mixture-only data",
subtitle = paste0("REML=", fit$REML)
) +
theme_bw() +
facet_wrap(~ param)
TODO
t1 <- proc.time()
t1 - t0
## user system elapsed
## 46.795 0.332 47.088
print(sessionInfo(), locale = FALSE)
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Debian GNU/Linux 12 (bookworm)
##
## Matrix products: default
## BLAS: /usr/local/R/4.5.1/lib/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.11.0 LAPACK version 3.11.0
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plantmix_1.0.3 lme4_1.1-37 Matrix_1.7-3 ggplot2_3.5.2 knitr_1.50
## [6] rmarkdown_2.29
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 dplyr_1.1.4 compiler_4.5.1
## [5] Rcpp_1.1.0 tidyselect_1.2.1 dichromat_2.0-0.1 jquerylib_0.1.4
## [9] splines_4.5.1 scales_1.4.0 boot_1.3-31 yaml_2.3.10
## [13] fastmap_1.2.0 lattice_0.22-7 R6_2.6.1 labeling_0.4.3
## [17] generics_0.1.4 igraph_2.1.4 rbibutils_2.3 MASS_7.3-65
## [21] tibble_3.3.0 nloptr_2.2.1 minqa_1.2.8 TMB_1.9.21
## [25] bslib_0.9.0 pillar_1.11.0 RColorBrewer_1.1-3 rlang_1.1.7
## [29] cachem_1.1.0 xfun_0.52 sass_0.4.10 cli_3.6.5
## [33] withr_3.0.2 magrittr_2.0.3 Rdpack_2.6.4 digest_0.6.37
## [37] grid_4.5.1 lifecycle_1.0.5 nlme_3.1-168 reformulas_0.4.4
## [41] vctrs_0.6.5 evaluate_1.0.4 glue_1.8.0 farver_2.1.2
## [45] tools_4.5.1 pkgconfig_2.0.3 htmltools_0.5.8.1