Commit 37adc58d authored by joerg.buescher's avatar joerg.buescher
Browse files

adding ML type svm and improving identification of best consensus peak. Now...

adding ML type svm and improving identification of best consensus peak. Now the consensus peak that best represents the original peaks identified by peak_detection is chosen
parent d10e09a2
......@@ -21,6 +21,7 @@ RoxygenNote: 7.1.1
Imports:
caret (>= 6.0-86),
e1071 (>= 1.7-3),
kernlab (>= 0.9-27),
mgcv(>= 1.8-28),
mzR (>= 2.20.0),
neuralnet (>= 1.44.2),
......
......@@ -71,7 +71,7 @@ if (prm$polarmets && !prm$sosomets){
}
if (id != refsmpl){
# fit shift for sample as robust quadratic
nowfitdata <- data.frame(shift = rtshift[id, ], rt = sapply(metab,'[[', 'RT')) # metabdb_data$metdb$RT[goodmets])
nowfitdata <- data.frame(shift = rtshift[id, ], rt = sapply(metab,'[[', 'RT'))
if (prm$sosomets) {
# poor man's robust spline fit
noww <- (rtshiftwheight[id, ]+1) * 1/(abs(nowfitdata$shift)+3)
......@@ -109,14 +109,16 @@ if (prm$polarmets && !prm$sosomets){
} else {
nowtshift <- predshift + prm$tshiftfine
}
tr <- which(abs(prm$unirt - metab[[im]]$RT ) < prm$timewindow)
# smaller time window for second shift
tr <- which(abs(prm$unirt - metab[[im]]$RT ) < (prm$timewindow/3))
shiftmatch <- numeric(length(nowtshift))
for (it in 1:length(nowtshift)) {
goodt <- which(tr+nowtshift[it] > 0 )
if (sum(!is.na(nd$refx1[tr[goodt]] * nd$x1[tr[goodt]+nowtshift[it]])) > 10) { # need at least 10 points to compute correlation
shiftmatch[it] <- sum(c(corweight[[id]][[im]][1] * cor(nd$refx1[tr[goodt]], nd$x1[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),#"complete.obs"),
corweight[[id]][[im]][2] * cor(nd$refx2[tr[goodt]], nd$x2[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),#"complete.obs"),
corweight[[id]][[im]][3] * cor(nd$refx3[tr[goodt]], nd$x3[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs")), na.rm=TRUE)#"complete.obs")), na.rm = TRUE)
shiftmatch[it] <- sum(c(corweight[[id]][[im]][1] * cor(nd$refx1[tr[goodt]], nd$x1[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),
corweight[[id]][[im]][2] * cor(nd$refx2[tr[goodt]], nd$x2[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs"),
corweight[[id]][[im]][3] * cor(nd$refx3[tr[goodt]], nd$x3[tr[goodt]+nowtshift[it]], use="pairwise.complete.obs")), na.rm=TRUE)
} else {
shiftmatch[it] <- 0
}
......@@ -142,13 +144,15 @@ for (im in 1:prm$nmet) {
tr <- which(abs(prm$unirt - metab[[im]]$RT ) < prm$timewindow)
pmat1 <- pmat2 <- pmat3 <- matrix(nrow = prm$nsmpl, ncol = length(tr))
refx1 <- smpl$chroms[[refsmpl]][[ metab[[im]]$quant$MRM ]] # mzml_data$chromdata[[refsmpl]][[metabdb_data$quantifier[[im]]$MRM]]
refx1 <- smpl$chroms[[refsmpl]][[ metab[[im]]$quant$MRM ]]
if ((!is.null(refx1)) && (sum(!is.na(refx1[tr])) > 20)) {
# currently we only use the timewindow around the expected RT that we used for rtshift adjustment
# for the creation of the prototype. Consequently, only peaks within this timewindow can be found
# One could test to use the whole chromatogram instead. However the alignment of parts outside the
# timewindow will probably be bad.
# currently we only use the time window around the expected RT
# that we used for rtshift adjustment for the creation of the
# prototype. Consequently, only peaks within this time window
# can be found. One could test to use the whole chromatogram
# instead. However the alignment of parts outside the
# time window will probably be bad.
for (id in 1:prm$nsmpl) {
if (prm$verbose >= 2){
cat('\r', paste('consensus_peaks: metab', im, 'sample', id, ' ')) # much faster than print(id)
......@@ -186,13 +190,14 @@ for (im in 1:prm$nmet) {
p3[is.na(p3)] <- 0
# peak detection
peaknum <- 3
# # peaknum <- 3
# determine candidates and their qs scores
p0 <- zoo::rollmean(rowSums(cbind(p1/max(p1,na.rm = TRUE), p2/max(p2,na.rm = TRUE), p3/max(p3,na.rm = TRUE)), na.rm = TRUE), 3, fill=0, align='center')
sp0 <- diff(p0)
maxsp0 <- max(abs(sp0))
local_max <- which(diff(sign(sp0))==-2)
peaktops <- local_max[order(p0[local_max], decreasing=T)][1:peaknum]
peaktops <- local_max[order(p0[local_max], decreasing=T)] #[1:peaknum]
peaknum <- length(peaktops)
peakstart <- peakend <- numeric(peaknum)
for (ip in 1:peaknum){
if (!is.na(peaktops[ip])){
......@@ -205,8 +210,9 @@ for (im in 1:prm$nmet) {
sp0 <- diff(p0)
maxsp0 <- max(abs(sp0))
local_max <- which(diff(sign(sp0))==-2)
peaktops <- c(peaktops, local_max[order(p0[local_max], decreasing=T)][1:peaknum])
for(ip in 1:peaknum){
peaktops <- c(peaktops, local_max[order(p0[local_max], decreasing=T)]) #[1:peaknum])
peaknum2 <- length(peaktops) - peaknum
for (ip in 1:peaknum2){
if (!is.na(peaktops[peaknum+ip])) {
peakstart[peaknum+ip] <- max(c(which(sp0[1: max(c(1,(peaktops[peaknum+ip]-3))) ] < 0.01*maxsp0),1))
peakend[peaknum+ip] <- peaktops[peaknum+ip]+3+min(which(-sp0[(peaktops[peaknum+ip]+3):length(sp0)] < 0.01*maxsp0), length(p0))
......@@ -214,25 +220,45 @@ for (im in 1:prm$nmet) {
}
peaktops <- peaktops[!is.na(peaktops)]
# determine quality scores on original p0
qsmat <- as.data.frame(matrix(nrow = (2*peaknum), ncol = n_qs))
colnames(qsmat) <- names(msd[[im]][[id]]$qs_list)
# # determine quality scores on original p0
# qsmat <- as.data.frame(matrix(nrow = (2*peaknum), ncol = n_qs))
# colnames(qsmat) <- names(msd[[im]][[id]]$qs_list)
# for (ip in 1:length(peaktops)) {
# qsmat[ip, ] <- get_quality_scores(p1, p2, p3, peakstart[ip], peaktops[ip], peakend[ip], nowusetrace, im, metab, prm, rangestart = tr[1])
# }
# alternative idea: get quality scores generated by peak_detection and
# find the consensus peak that represents the highest average quality score
originalrt <- unlist(sapply(msd[[im]], '[[', 'RT'))
originalqs <- unlist(sapply(msd[[im]], '[[', 'qs'))
metabshift <- 1/(prm$samplingfrequency * 60) * rtshift2[ ,im]
peakoldqs <- numeric(length(peaktops))
for (ip in 1:length(peaktops)) {
qsmat[ip, ] <- get_quality_scores(p1, p2, p3, peakstart[ip], peaktops[ip], peakend[ip], nowusetrace, im, metab, prm, rangestart = tr[1])
if (!is.na(peakstart[ip]) && !is.na(peakend[ip])) {
nowpstart <- prm$unirt[min(max(c(0, peakstart[ip] + tr[1])), (prm$nscan-10))]
nowpend <- prm$unirt[max(c(3, min(c(prm$nscan, peakend[ip] + tr[1])) )) ]
peakoldqs[ip] <- sum(originalqs[intersect(which((originalrt - metabshift) > nowpstart),
which((originalrt - metabshift) < nowpend))]) /
(peakend[ip] - peakstart[ip])
}
}
peakoldqs[is.na(peakoldqs)] <- 0
# Get best peak
if (prm$ml_type == 'mltrain_initial'){
# select best candidate by RF model
pred_qs <- rowMeans(qsmat)
bestcand <- which(pred_qs == max(pred_qs, na.rm = TRUE))[1]
}else{
# Transform input data
trans <- predict(prm$train_preprocessing, as.data.frame(qsmat) )
# select best candidate by RF model
pred_qs <- predict(prm$model, trans, type="prob")
bestcand <- which(pred_qs$H == max(pred_qs$H ))[1] # take first candidate if two bestcands are present
}
bestcand <- which(peakoldqs == max(peakoldqs))
bestcand <- bestcand[1]
# if (prm$ml_type == 'mltrain_initial'){
# # select best candidate by RF model
# pred_qs <- rowMeans(qsmat)
# bestcand <- which(pred_qs == max(pred_qs, na.rm = TRUE))[1]
# }else{
# # Transform input data
# trans <- predict(prm$train_preprocessing, as.data.frame(qsmat) )
# # select best candidate by RF model
# pred_qs <- predict(prm$model, trans, type="prob")
# bestcand <- which(pred_qs$H == max(pred_qs$H ))[1] # take first candidate if two bestcands are present
# }
pstart <- peakstart[bestcand]
pend <- peakend[bestcand]
......
......@@ -152,19 +152,21 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
# Replace non existing quantifiers with qual12c MRMs if present ------------------------------
metab <- quant_qual12c_replacement(metab, smpl, prm)
# Detect peaks (anchromet, groupmets) per sample | shiftgroup ------------------------------
msd <- peak_detection(metab, smpl, prm)
} # endif mltrain_prepare shortcut
if (prm$verbose >= 2) {
save(metab, smpl, msd, prm, file = file.path(prm$batchdir, 'troubleshoot.RData') )
}
# do not proceed in case of initial training
if(prm$ml_type == 'mltrain_initial'){
print ('qqq_auto_integrate complete.')
return(NULL)
}
} # endif mltrain_prepare shortcut
# Detect peaks (anchromet, groupmets) per sample | shiftgroup ------------------------------
msd <- peak_detection(metab, smpl, prm)
if (prm$verbose >= 2) {
save(metab, smpl, msd, prm, file = file.path(prm$batchdir, 'troubleshoot.RData') )
}
# do not proceed in case of initial training
if(prm$ml_type == 'mltrain_initial'){
print ('qqq_auto_integrate complete.')
return(NULL)
}
# subset peakdata (only non NA metabolites are returned)
goodmets <- get_goodmets(metab, msd, prm)
......
......@@ -3,7 +3,7 @@
#' This function trains the random forest models used for peak classification
#'
#' @param model_type pcand or finalp to train peak recognition model or peak output model
#' @param ml_type type of machine learning algorithm to be used. Options are rf or nnet or xgbtree
#' @param ml_type type of machine learning algorithm to be used. Options are rf or nnet or xgbtree or svm
#' @param base_folder system folder that contains training data in subfolders
#' @param data_sets vector of names of folders in base_folder that contain training data and training solutions
#' @param model_file_name name of model to be written do disc
......@@ -141,18 +141,19 @@ train_model <- function(model_type, ml_type = 'rf', base_folder = '.', data_sets
names(test_data)[ncol(test_data)] <- "y"
# Set the training control conditions ------------------------------------------------------------
fitControl <- caret::trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'all', # saves predictions for optimal tuning parameter
returnResamp = "all", # ?
classProbs = TRUE, # should class probabilities be returned
summaryFunction=caret::twoClassSummary, # results summary function
verboseIter=TRUE # training log
)
if (ml_type == 'rf') {
# Set the training control conditions ------------------------------------------------------------
fitControl <- caret::trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'all', # saves predictions for optimal tuning parameter
returnResamp = "all", # ?
classProbs = TRUE, # should class probabilities be returned
summaryFunction=caret::twoClassSummary, # results summary function
verboseIter=TRUE # training log
)
model = caret::train(y ~ .,
data=train_data,
method='rf', # 'xgbTree', 'rf', 'nnet'
......@@ -163,6 +164,17 @@ train_model <- function(model_type, ml_type = 'rf', base_folder = '.', data_sets
trControl = fitControl)
} else if (ml_type == 'nnet') {
# Set the training control conditions ------------------------------------------------------------
fitControl <- caret::trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'all', # saves predictions for optimal tuning parameter
returnResamp = "all", # ?
classProbs = TRUE, # should class probabilities be returned
summaryFunction=caret::twoClassSummary, # results summary function
verboseIter=TRUE # training log
)
model = caret::train(y ~ .,
data=train_data,
method='nnet',
......@@ -172,7 +184,39 @@ train_model <- function(model_type, ml_type = 'rf', base_folder = '.', data_sets
nodesize = 5, # 1 as an alternative
trControl = fitControl)
} else if (ml_type == 'svm') {
# Set the training control conditions ------------------------------------------------------------
fitControl <- caret::trainControl(
method = 'cv', # k-fold cross validation
number = 3, # number of folds
returnResamp = "all", # ?
classProbs = TRUE, # should class probabilities be returned
summaryFunction=caret::twoClassSummary, # results summary function
verboseIter=TRUE # training log
)
C <- c(0.1, 1, 10, 100)
degree <- c(1,2,3)
scale <- c(0.001, 0.01, 0.1, 1) # higher scales take forever to train
gr.poly <- expand.grid(C=C, degree=degree, scale=scale)
model = caret::train(y ~ .,
data=train_data,
method='svmPoly',
maxit = 500,
metric="ROC",
trControl = fitControl,
tuneGrid = gr.poly)
} else if (ml_type == 'xgbtree') {
# Set the training control conditions ------------------------------------------------------------
fitControl <- caret::trainControl(
method = 'cv', # k-fold cross validation
number = 3, # number of folds
returnResamp = "all", # ?
classProbs = TRUE, # should class probabilities be returned
summaryFunction=caret::twoClassSummary, # results summary function
verboseIter=TRUE # training log
)
model = caret::train(y ~ .,
data=train_data,
method='xgbTree',
......
......@@ -15,7 +15,7 @@ train_model(
\arguments{
\item{model_type}{pcand or finalp to train peak recognition model or peak output model}
\item{ml_type}{type of machine learning algorithm to be used. Options are rf or nnet or xgbtree}
\item{ml_type}{type of machine learning algorithm to be used. Options are rf or nnet or xgbtree or svm}
\item{base_folder}{system folder that contains training data in subfolders}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment