Commit 07b26dd9 authored by joerg.buescher's avatar joerg.buescher
Browse files

tidy up after merge

parents 87b69a5b fe4cf45f
No preview for this file type
.Rproj.user
.Rhistory
.DS_Store
inst/.DS_Store
Package: automRm
Title: Fully automatic processing of LC-QQQ data from .mzML to .xlsx
Version: 0.1.0
Authors@R: c(
    person(given = "Joerg",
           family = "Buescher",
           role = c("aut", "cre"),
           email = "buescher@ie-freiburg.mpg.de",
           comment = c(ORCID = "0000-0002-6547-0076")),
    person(given = "Daniel",
           family = "Eilertz",
           role = "aut",
           email = "eilertz@ie-freiburg.mpg.de"))
Description: automRm processes LC-QQQ raw data in the open .mzML format to obtain a user-friendly output of signal intensities for every analyte and every sample in .xlsx format. In addition to the .mzML files, a matching list of target metabolites must be available. automRm then uses two random forest models: the first decides which chromatographic peak is the most likely to represent an analyte, and the second decides whether the data is of sufficient quality to be given to a person with little metabolomics experience. Both random forest models can easily be trained to meet one's LC-MS methods and quality standards.
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
Imports:
    caret (>= 6.0-86),
    e1071 (>= 1.7-3),
    kernlab (>= 0.9-27),
    mgcv (>= 1.8-28),
    mzR (>= 2.20.0),
    neuralnet (>= 1.44.2),
    nnls (>= 1.4),
    openxlsx (>= 4.1.4),
    randomForest (>= 4.6-14),
    robustbase (>= 0.93-6),
    xgboost (>= 0.82.1),
    zoo (>= 1.8-7)
# Generated by roxygen2: do not edit by hand
export(initialize_prm)
export(process_batch)
export(train_model)
#' automRm: Taking LC-QQQ peak picking to the next level
#'
#' automRm processes LC-QQQ raw data in the open .mzML format to obtain a user-friendly output of signal intensities for every analyte and every sample in .xlsx format. \cr \cr
#' In addition to the .mzML files a matching list of target metabolites must be available. automRm then uses 2 random forest models to 1st. decide which chromatographic peak is the most likely to represent an analyte and 2nd to decide if the data is of sufficient quality to be given to a person with little metabolomics experience.
#' Both random forest models can easily be trained to meet one's LC-MS methods and quality standards.
#'
#'
#' @section automRm functions:
#' @param train_model Train first and/or second model for automated peak picking.
#' @param initialize_prm Load and assign global default variables. Default variables can be overwritten by the parameters argument when calling process_batch or by using edited update_prm.tsv files in the working folder.
#' @param process_batch Automated analysis of provided .mzML files.
#'
#' @docType package
#' @name automRm
NULL
#> NULL
# compressplot: thin out a numeric trace so overview plots render faster.
# Returns a vector z of the same length as y in which visually redundant
# points are replaced by NA (so plotting skips them) while preserving the
# first point, the last point, local minima/maxima, and any point that
# deviates by more than 3% of the trace maximum from the last kept value.
# Runs of omitted points are summarized by a single averaged point.
# NA positions in the input are restored as NA in the output.
compressplot <- function(y) {
# function to reduce amount of data to be plotted for faster loading of overview plots
if (length(y) < 10) {
# short traces are returned unchanged - nothing worth compressing
z <- y
} else {
z <- rep(NA,length(y))
# remember where the input had NAs; treat them as 0 during the scan and
# restore them at the end
napos <- is.na(y)
y[napos] <- 0
maxy <- max(y,na.rm = TRUE)
z[1] <- y[1]
refy <- y[1]
# omitcount counts consecutive points skipped since the last kept point;
# beforesign tracks the sign of the previous first difference so that
# local extrema (sign changes) are always drawn
omitcount <- 0
beforesign <- 1
for (ix in 2:length(y)) {
nowsign <- sign(y[ix]-y[ix-1])
if (abs(y[ix] - refy) > (0.03 * maxy)) { # if difference from last printed value is too large
# place one averaged point roughly in the middle of the omitted run
z[ix-round(omitcount/2)] <- mean(y[(ix-omitcount):ix], na.rm = TRUE)
omitcount <- 0
refy <- y[ix]
} else if (omitcount == 20) { # after 20 omitted points draw average of points
z[ix-10] <- mean(y[(ix-19):ix], na.rm = TRUE)
omitcount <- 0
refy <- y[ix]
} else if (nowsign != beforesign) { # draw minima and maxima
z[ix] <- y[ix]
omitcount <- 0
refy <- y[ix]
} else { # otherwise omit because there is a negligible difference
z[ix] <- NA
omitcount <- omitcount + 1
}
beforesign <- nowsign
}
# ix still holds length(y) here, so this keeps the final point
z[ix] <- y[ix] # always use last point
z[napos] <- NA
} # endif length(y) < 10
z
} # endfunction
consensus_peaks_update <- function(metab, smpl, msd, prm) {
consensus_peaks_update <- function(metab, smpl, msd, prm) {
print(' ')
# find ref sample
rtshiftwheight <- rtshift <- rtshift2 <- matrix(nrow = prm$nsmpl, ncol=prm$nmet) # topcand <- candrt <- bestqs
pstartmat <- pendmat <- rtshiftwheight <- rtshift <- rtshift2 <- matrix(nrow = prm$nsmpl, ncol=prm$nmet)
#number of quality scores (get it from first peak of first metabolite in first sample)
n_qs <- length(msd[[1]][[1]]$qs_list)
......@@ -15,7 +15,7 @@ colnames(qslog) <- c('Sample', 'Metabolite', names(msd[[1]][[1]]$qs_list), 'Outp
# find best scoring sample as the one where the sum of qs is highest across all metabolites
bestqs <- numeric(prm$nsmpl)
for (id in 1:prm$nsmpl) {
for (id in 1:prm$nsmpl) {
bestqs[id] <- sum( unlist(sapply(msd, '[[', id)['qs', ]) , na.rm = TRUE)
}
refsmpl <- which.max(bestqs)[1]
......@@ -23,15 +23,14 @@ refsmpl <- which.max(bestqs)[1]
# find shift relative to refsmpl
corweight <- list()
for (id in 1:prm$nsmpl) {
#id <- 17
corweight[[id]] <- list()
for (im in 1:prm$nmet) {
if (prm$verbose >= 2){
cat('\r', paste('consensus_peaks: initial rt shift for sample ', id, 'metab', im, ' '))
cat('\r', paste('consensus_peaks: initial rt shift for sample ', id, 'metab', im, ' '))
}
# the x/x12/x13 values in msd have been smoothed, so let's go back to the original data.
nd <- get_ref_chromatograms(metab, msd, smpl, id, refsmpl, im)
tr <- which(abs(prm$unirt - metab[[im]]$RT ) < prm$timewindow)
shiftrangecrude <- prm$tshiftcrude
if (prm$sosomets) { # larger shift range for soso mets
......@@ -39,15 +38,15 @@ for (id in 1:prm$nsmpl) {
}
shiftmatch <- shiftcor <- numeric(length(shiftrangecrude))
# wheight traces by log(max(x1)) to reduce impact of absent compounds
corweight[[id]][[im]] <- log(c(max(nd$x1[tr],na.rm = TRUE),
max(nd$x2[tr],na.rm = TRUE),
corweight[[id]][[im]] <- log(c(max(nd$x1[tr],na.rm = TRUE),
max(nd$x2[tr],na.rm = TRUE),
max(nd$x3[tr],na.rm = TRUE))+10,10 )
for (it in 1:length(shiftrangecrude)) { # use=complete.obs is equivalent to na.rm=TRUE in other functions
goodt <- which((tr+shiftrangecrude[it]) > 0 )
if (sum(!is.na(nd$refx1[tr[goodt]] * nd$x1[tr[goodt]+shiftrangecrude[it]])) > 10) { # need at least 10 points to compute correlation
now3cor <- c(cor(nd$refx1[tr[goodt]], nd$x1[tr[goodt]+shiftrangecrude[it]], use="pairwise.complete.obs"),#"complete.obs"),
cor(nd$refx2[tr[goodt]], nd$x2[tr[goodt]+shiftrangecrude[it]], use="pairwise.complete.obs"),#"complete.obs"),
cor(nd$refx2[tr[goodt]], nd$x2[tr[goodt]+shiftrangecrude[it]], use="pairwise.complete.obs"),#"complete.obs"),
cor(nd$refx3[tr[goodt]], nd$x3[tr[goodt]+shiftrangecrude[it]], use="pairwise.complete.obs"))#"complete.obs"))
shiftmatch[it] <- sum(corweight[[id]][[im]] * now3cor, na.rm = TRUE)
} else {
......@@ -62,16 +61,16 @@ for (id in 1:prm$nsmpl) {
# fine-adjust rt shift only for really polar
if (prm$polarmets && !prm$sosomets){
if (prm$verbose >=2 ){
pdf(file = "shiftplots.pdf", height = 6, width = 6, family = "Helvetica")
pdf(file = file.path(prm$batchdir, "shiftplots.pdf"), height = 6, width = 6, family = "Helvetica")
}
fitqual <- numeric()
for (id in 1:prm$nsmpl) {
if (prm$verbose >= 2){
cat('\r', paste('consensus_peaks: second rt shift for sample ', id, ' '))
cat('\r', paste('consensus_peaks: second rt shift for sample ', id, ' '))
}
if (id != refsmpl){
# fit shift for sample as robust quadratic
nowfitdata <- data.frame(shift = rtshift[id, ], rt = sapply(metab,'[[', 'RT')) # metabdb_data$metdb$RT[goodmets])
nowfitdata <- data.frame(shift = rtshift[id, ], rt = sapply(metab,'[[', 'RT'))
if (prm$sosomets) {
# poor man's robust spline fit
noww <- (rtshiftwheight[id, ]+1) * 1/(abs(nowfitdata$shift)+3)
......@@ -87,14 +86,14 @@ if (prm$polarmets && !prm$sosomets){
fitqual[id] <- sum((0.02*nowfit$residuals)^2,na.rm = TRUE)
})
}
if (prm$verbose >= 2 ) {
plot(nowfitdata$rt, nowfitdata$shift, main=paste(smpl$samplenames[id], round(fitqual[id])) )
lines(seq(0,max(nowfitdata$rt,na.rm = TRUE)+1,0.1),
lines(seq(0,max(nowfitdata$rt,na.rm = TRUE)+1,0.1),
predict(nowfit,data.frame(rt=seq(0,max(nowfitdata$rt,na.rm = TRUE)+1,0.1))) ,
col = 'blue')
}
for (im in 1:prm$nmet) {
# im <- 27
nd <- get_ref_chromatograms(metab, msd, smpl, id, refsmpl, im)
......@@ -103,33 +102,39 @@ if (prm$polarmets && !prm$sosomets){
# save predshift for plotting
msd[[im]][[id]]$predshift <- predshift
if (prm$sosomets) { # double fine shift range for soso polar
nowtshift <- predshift + seq(2*min(prm$tshiftfine), 2*max(prm$tshiftfine),1)
} else {
nowtshift <- predshift + prm$tshiftfine
}
tr <- which(abs(prm$unirt - metab[[im]]$RT ) < prm$timewindow)
# smaller time window for second shift
tr <- which(abs(prm$unirt - metab[[im]]$RT ) < (prm$timewindow*0.5))
shiftmatch <- numeric(length(nowtshift))
for (it in 1:length(nowtshift)) {
goodt <- which(tr+nowtshift[it] > 0 )
if (sum(!is.na(nd$refx1[tr[goodt]] * nd$x1[tr[goodt]+nowtshift[it]])) > 10) { # need at least 10 points to compute correlation
shiftmatch[it] <- sum(c(corweight[[id]][[im]][1] * cor(nd$refx1[tr[goodt]], nd$x1[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),#"complete.obs"),
corweight[[id]][[im]][2] * cor(nd$refx2[tr[goodt]], nd$x2[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),#"complete.obs"),
corweight[[id]][[im]][3] * cor(nd$refx3[tr[goodt]], nd$x3[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs")), na.rm=TRUE)#"complete.obs")), na.rm = TRUE)
shiftmatch[it] <- sum(c(corweight[[id]][[im]][1] * cor(nd$refx1[tr[goodt]], nd$x1[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),
corweight[[id]][[im]][2] * cor(nd$refx2[tr[goodt]], nd$x2[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),
corweight[[id]][[im]][3] * cor(nd$refx3[tr[goodt]], nd$x3[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs")), na.rm=TRUE)
} else {
shiftmatch[it] <- 0
}
}
# just in case there are multiple optima, prefer the one that is closest to the predicted shift
var <- 20*(max(prm$tshiftfine) - min(prm$tshiftfine))
profile <- (exp(-(prm$tshiftfine)^2/(2*var)) / sqrt(2*pi*var))
shiftmatch <- (shiftmatch-min(shiftmatch))*profile
rtshift2[id,im] <- nowtshift[which.max(shiftmatch)]
} # for all goodmets
} # for all goodmets
} else {
rtshift2[id, ] <- rtshift[id, ]
}
} # for all samples
if(prm$verbose >=2){
dev.off() # close pdf
dev.off() # close pdf
}
} else { # endif only for polar
fitqual <- numeric(prm$nsmpl)
......@@ -142,12 +147,15 @@ for (im in 1:prm$nmet) {
tr <- which(abs(prm$unirt - metab[[im]]$RT ) < prm$timewindow)
pmat1 <- pmat2 <- pmat3 <- matrix(nrow = prm$nsmpl, ncol = length(tr))
refx1 <- smpl$chroms[[refsmpl]][[ metab[[im]]$quant$MRM ]] # mzml_data$chromdata[[refsmpl]][[metabdb_data$quantifier[[im]]$MRM]]
if (!is.null(refx1)) {
# currently we only use the timewindow around the expexted RT that we used for rtshift adjustment
# for the creation of the prototype. Consequently, only peaks within this timewindow can be found
# One could test to use the whole chromatogram instead. However the alignment of parts outside the
# timewindow will probably be bad.
refx1 <- smpl$chroms[[refsmpl]][[ metab[[im]]$quant$MRM ]]
if ((!is.null(refx1)) && (sum(!is.na(refx1[tr])) > 20)) {
# currently we only use the time window around the expected RT
# that we used for rtshift adjustment for the creation of the
# prototype. Consequently, only peaks within this time window
# can be found. One could test to use the whole chromatogram
# instead. However the alignment of parts outside the
# time window will probably be bad.
for (id in 1:prm$nsmpl) {
if (prm$verbose >= 2){
cat('\r', paste('consensus_peaks: metab', im, 'sample', id, ' ')) # much faster than print(id)
......@@ -172,7 +180,6 @@ for (im in 1:prm$nmet) {
pmat3[id, goodt] <- nd$x3[tr[goodt]+rtshift2[id,im]]
}
}
# print(foundtrace)
} # endfor all samples
# take average of all aligned chromatograms for each trace as prototype chromatogram
......@@ -185,13 +192,13 @@ for (im in 1:prm$nmet) {
p3[is.na(p3)] <- 0
# peak detection
peaknum <- 3
# determine candidates and their qs scores
p0 <- zoo::rollmean(rowSums(cbind(p1/max(p1,na.rm = TRUE), p2/max(p2,na.rm = TRUE), p3/max(p3,na.rm = TRUE)), na.rm = TRUE), 3, fill=0, align='center')
sp0 <- diff(p0)
maxsp0 <- max(abs(sp0))
local_max <- which(diff(sign(sp0))==-2)
peaktops <- local_max[order(p0[local_max], decreasing=T)][1:peaknum]
peaktops <- local_max[order(p0[local_max], decreasing=T)]
peaknum <- length(peaktops)
peakstart <- peakend <- numeric(peaknum)
for (ip in 1:peaknum){
if (!is.na(peaktops[ip])){
......@@ -204,8 +211,9 @@ for (im in 1:prm$nmet) {
sp0 <- diff(p0)
maxsp0 <- max(abs(sp0))
local_max <- which(diff(sign(sp0))==-2)
peaktops <- c(peaktops, local_max[order(p0[local_max], decreasing=T)][1:peaknum])
for(ip in 1:peaknum){
peaktops <- c(peaktops, local_max[order(p0[local_max], decreasing=T)])
peaknum2 <- length(peaktops) - peaknum
for (ip in 1:peaknum2){
if (!is.na(peaktops[peaknum+ip])) {
peakstart[peaknum+ip] <- max(c(which(sp0[1: max(c(1,(peaktops[peaknum+ip]-3))) ] < 0.01*maxsp0),1))
peakend[peaknum+ip] <- peaktops[peaknum+ip]+3+min(which(-sp0[(peaktops[peaknum+ip]+3):length(sp0)] < 0.01*maxsp0), length(p0))
......@@ -213,34 +221,34 @@ for (im in 1:prm$nmet) {
}
peaktops <- peaktops[!is.na(peaktops)]
# determine quality scores on original p0
qsmat <- as.data.frame(matrix(nrow = (2*peaknum), ncol = n_qs))
colnames(qsmat) <- names(msd[[im]][[id]]$qs_list)
# get quality scores generated by peak_detection and
# find the consensus peak that represents the highest average quality score
originalrt <- unlist(sapply(msd[[im]], '[[', 'RT'))
originalqs <- unlist(sapply(msd[[im]], '[[', 'qs'))
metabshift <- 1/(prm$samplingfrequency * 60) * rtshift2[ ,im]
peakoldqs <- numeric(length(peaktops))
for (ip in 1:length(peaktops)) {
qsmat[ip, ] <- get_quality_scores(p1, p2, p3, peakstart[ip], peaktops[ip], peakend[ip], nowusetrace, im, metab, prm, rangestart = tr[1])
if (!is.na(peakstart[ip]) && !is.na(peakend[ip])) {
nowpstart <- prm$unirt[min(max(c(0, peakstart[ip] + tr[1])), (prm$nscan-10))]
nowpend <- prm$unirt[max(c(3, min(c(prm$nscan, peakend[ip] + tr[1])) )) ]
peakoldqs[ip] <- sum(originalqs[intersect(which((originalrt - metabshift) > nowpstart),
which((originalrt - metabshift) < nowpend))]) /
sqrt(peakend[ip] - peakstart[ip])
}
}
peakoldqs[is.na(peakoldqs)] <- 0
# Get best peak
if (prm$ml_type == 'mltrain_initial'){
# select best candidate by RF model
pred_qs <- rowMeans(qsmat)
bestcand <- which(pred_qs == max(pred_qs))
}else{
# Transform input data
trans <- predict(prm$train_preprocessing, as.data.frame(qsmat) )
# select best candidate by RF model
pred_qs <- predict(prm$model_rf, trans, type="prob")
bestcand <- which(pred_qs$H == max(pred_qs$H ))[1] # take first candidate if two bestcands are present
}
bestcand <- which(peakoldqs == max(peakoldqs))
bestcand <- bestcand[1]
pstart <- peakstart[bestcand]
pend <- peakend[bestcand]
# propagate peak start and end to all measurements
for (id in 1:prm$nsmpl) {
# id <- 1
nowpstart <- min(max(c(0, pstart + rtshift2[id,im] + tr[1])), (prm$nscan-10))
nowpend <- min(c(prm$nscan, pend + rtshift2[id,im] + tr[1]))
nowpend <- max(c(3, min(c(prm$nscan, pend + rtshift2[id,im] + tr[1])) ))
# micro-adjust start and end
nowmicrorangestart <- nowpstart+prm$microsearch
......@@ -287,14 +295,19 @@ for (im in 1:prm$nmet) {
final_peak_class$H <- mean(as.numeric(nowqs))
}else{
trans <- predict(prm$train_preprocessing, as.data.frame(nowqs) )
final_peak_class <- predict(prm$model_rf, trans,type="prob")
final_peak_class <- predict(prm$model, trans,type="prob")
}
# log quality scores for ML training
qslog[prm$nmet*(id-1)+im, ] <- c(smpl$samplenames[id],
metab[[im]]$name ,
as.numeric(nowqs),
final_peak_class$H)
# collect data for template for training solutions
pstartmat[id, im] <- nowpstart
pendmat[id, im] <- nowpend
#Set peak parameters
msd[[im]][[id]]$qs_list <- nowqs
msd[[im]][[id]]$qs <- final_peak_class$H
......@@ -307,34 +320,74 @@ for (im in 1:prm$nmet) {
msd[[im]][[id]]$ylim <- c(0,1.1*max(c(100,nd$x1[nowptop],nowusetrace[2]*nd$x2[nowptop],nowusetrace[3]*nd$x3[nowptop]),na.rm = TRUE))
msd[[im]][[id]]$label1 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n',
'QS = ', round(final_peak_class$H*100) )
msd[[im]][[id]]$label2 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n',
'Cor:', round(nowqs$QS_cor123, 2),
' T/O:', round(nowqs$QS_T.o, 2),
' T/Bmax:', round(nowqs$QS_T.h, 2),
' T/Bmin:', round(nowqs$QS_T.l, 2),
' dRT:', round(nowqs$QS_dRT, 2),
' width:', round(nowqs$QS_w, 2),
' Cor 12c:', round(nowqs$QS_cor12, 2),
' Cor 13c:', round(nowqs$QS_cor13, 2),
' Cor 12c/13c: ', round(nowqs$QS_cor23, 2),
'\n',
' PInt: ', round(nowqs$QS_height, 2),
' relRT:' , round(nowqs$QS_relRT, 2),
' T/O 12c:' , round(nowqs$QS_T.o2, 2),
' T/O 13c:' , round(nowqs$QS_T.o3, 2),
' dRT_shift:' , round(nowqs$QS_dRTs, 2),
'QS_pcand = ', round(final_peak_class$H*100) )
label2 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n')
qsnames <- names(nowqs)
nowcount <- 0
for (iq in 1:length(nowqs) ) {
nowcount <- nowcount + 1
label2 <- paste0( label2, substr(qsnames[iq], 4, nchar(qsnames[iq]) ) , ': ', as.character(round(nowqs[[qsnames[iq]]],2) ), ' ')
if ((nowcount > 7) && (iq < length(nowqs))) {
label2 <- paste0(label2, '\n')
nowcount <- 0
}
}
msd[[im]][[id]]$label2 <- paste0(label2,
'\n',
' rf_final:' , final_peak_class$H)
' ML_pcand: ' , round(final_peak_class$H,2))
} # endfor all samples
} # endif metabdb_data$quantifier was measured in refsample
} # endfor all metabolites
# Write training data to tsv (quality scores for 2. model)
write.table(qslog, file = 'qslog.tsv', sep = '\t', row.names = FALSE)
if (prm$verbose >= 1) {
# write log of quality scores for training to tsv file
write.table(qslog, file = file.path(prm$batchdir, 'qslog.tsv'), sep = '\t', row.names = FALSE)
nowscore <- matrix(nrow = prm$nsmpl, ncol = prm$nmet, data = NA)
# use old scores if available and peaks match
if (file.exists(file.path(prm$batchdir, "manual_peakcheck.xlsx"))) {
oldcheck <- openxlsx::read.xlsx(file.path(prm$batchdir, "manual_peakcheck.xlsx"), colNames = FALSE)
for (id in 1:prm$nsmpl) {
spos <- which(oldcheck[ ,1] == smpl$filenames[id])
if (length(spos) == 1) {
for (im in 1:prm$nmet) {
mpos <- which(oldcheck[1, ] == metab[[im]]$name)
if (length(mpos) == 1) {
if (!any(is.na(c(pstartmat[id ,im],pendmat[id ,im] )))) {
if ( (abs(as.numeric(oldcheck[spos, mpos+1]) - prm$unirt[pstartmat[id ,im]] ) < 0.1) &&
(abs(as.numeric(oldcheck[spos, mpos+2]) - prm$unirt[pendmat[id ,im]] ) < 0.1) ) {
nowscore[id,im] <- oldcheck[spos, mpos]
}
}
}
}
}
}
}
# write template for training solution to xlsx file
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, 'Sheet1')
openxlsx::writeData(wb, 'Sheet1', smpl$filenames[prm$typeorder], startCol = 1, startRow = 4, rowNames = FALSE, colNames = FALSE)
for (im in 1:prm$nmet) {
nowcol <- 3*im
openxlsx::writeData(wb, 'Sheet1', metab[[im]]$name, startCol = nowcol, startRow = 1, rowNames = FALSE, colNames = FALSE)
openxlsx::writeData(wb, 'Sheet1', t(c('Score', 'Start', 'End')) , startCol = nowcol, startRow = 2, rowNames = FALSE, colNames = FALSE)
options("openxlsx.numFmt" = "0")
openxlsx::writeData(wb, 'Sheet1', nowscore[prm$typeorder, im] , startCol = nowcol, startRow = 4, rowNames = FALSE, colNames = FALSE)
options("openxlsx.numFmt" = "0.00")
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pstartmat[prm$typeorder ,im]] , startCol = nowcol+1, startRow = 4, rowNames = FALSE, colNames = FALSE)
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pendmat[prm$typeorder ,im]] , startCol = nowcol+2, startRow = 4, rowNames = FALSE, colNames = FALSE)
}
openxlsx::saveWorkbook(wb, file.path(prm$batchdir, "manual_peakcheck_template.xlsx"), overwrite = TRUE)
}
msd # return msd
......
evaluate_peaks <- function(msd, prm){
  # Score every sample*metabolite peak with the final random-forest model.
  # In training mode all peak candidates are reported unconditionally.
  # Per-batch quantiles of the first-model scores ('qs') are computed per
  # metabolite and used as additional predictors for the final model.
  # Updates msd with:
  #   - label1/label2 extended with the final score and the 'rep' flag
  #   - a logical 'report' entry per sample*metabolite
  print(' ')
  is_training <- substr(prm$ml_type, 1, 7) == 'mltrain'  # evaluate once for speed
  if (is_training && (prm$verbose >= 2)) {
    cat('\r', 'evaluate_peaks: Setting default values (report <- TRUE) in training mode')
  }
  for (im in seq_len(prm$nmet)) {
    # batch-wide quantiles of the first-model scores for this metabolite
    qs_quant <- quantile(sapply(msd[[im]], '[[', 'qs'))
    names(qs_quant) <- paste0("RF", seq(0, 100, 25))
    for (id in seq_len(prm$nsmpl)) {
      if (is_training) {
        # training mode: flag every candidate for reporting
        msd[[im]][[id]]$report <- TRUE
        next
      }
      if (prm$verbose >= 2) {
        cat('\r', paste('evaluate_peaks: metab', im, 'sample', id, ' '))
      }
      nowpeak <- msd[[im]][[id]]
      # assemble predictors: per-peak quality scores + own score + quantiles
      predictors <- as.data.frame(c(nowpeak$qs_list,
                                    c(data.frame('Output_H' = nowpeak$qs), qs_quant)))
      prepped <- predict(prm$train_preprocessing_final, predictors)
      final_pred <- predict(prm$model_final, prepped, type = "prob")$H
      # splice the final score into label1 right after the second line break
      cutpos <- gregexpr('\n', nowpeak$label1)[[1]][2]
      msd[[im]][[id]]$label1 <- paste0(substr(nowpeak$label1, 1, cutpos),
                                       'QS: ', round(100 * final_pred, 0))
      # append final score and batch quantiles to label2
      quant_txt <- paste0(names(qs_quant), ": ", round(qs_quant, 2), collapse = " ")
      msd[[im]][[id]]$label2 <- paste(nowpeak$label2,
                                      paste0(paste(" rep: ", round(final_pred, 3), "\n", quant_txt)))
      # report only confident peaks that were found in more than one trace
      msd[[im]][[id]]$report <- (final_pred >= 0.5) && (sum(nowpeak$foundchrom) > 1)
    }
  }
  # return the updated data structure
  msd
}# endfunction
evaluate_peaks_rf <- function(msd, prm){
# Function to evaluate peaks based on final random forest model
# In Trainig mode: all peak candiates are reported
# In Training mode: all peak candidates are reported
# Quantiles of RF scores for each batch calculated
# Final random forest model predicts peak scores based on each peaks QS + rf score quantiles
# Final random forest model predicts peak scores based on each peaks QS + rf score quantiles
# Updates msd:
# - new label with report flag 'rep'
# - new variable per sample*metabolite 'report' with score probability
print(' ')
trainingflag <- (substr(prm$ml_type,1,7) == 'mltrain') # only compute once for speed