Commit 4ccdc195 authored by joerg.buescher

more efficient training

parent bd0b3a7a
@@ -341,6 +341,27 @@ if (prm$verbose >= 1) {
# write log of quality scores for training to tsv file
write.table(qslog, file = file.path(prm$batchdir, 'qslog.tsv'), sep = '\t', row.names = FALSE)
nowscore <- matrix(nrow = prm$nsmpl, ncol = prm$nmet, data = NA)
# use old scores if available and peaks match
if (file.exists(file.path(prm$batchdir, "manual_peakcheck.xlsx"))) {
  oldcheck <- openxlsx::read.xlsx(file.path(prm$batchdir, "manual_peakcheck.xlsx"), colNames = FALSE)
  for (id in 1:prm$nsmpl) {
    spos <- which(oldcheck[ ,1] == smpl$filenames[id])
    if (length(spos) == 1) {
      for (im in 1:prm$nmet) {
        mpos <- which(oldcheck[1, ] == metab[[im]]$name)
        if (length(mpos) == 1) {
          if ( (abs(as.numeric(oldcheck[spos, mpos+1]) - prm$unirt[pstartmat[id ,im]] ) < 0.1) &&
               (abs(as.numeric(oldcheck[spos, mpos+2]) - prm$unirt[pendmat[id ,im]] ) < 0.1) ) {
            # as.numeric keeps nowscore a numeric matrix; read.xlsx returns this column as character
            nowscore[id,im] <- as.numeric(oldcheck[spos, mpos])
          }
        }
      }
    }
  }
}
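# Assumed manual_peakcheck.xlsx layout, inferred from the read code above and the
# template-writing code below: sample filenames in column 1, metabolite names in
# row 1, and per metabolite a Score/Start/End column triplet (columns mpos,
# mpos+1, mpos+2), with sample rows starting at row 4. An old score is only
# reused when both the stored Start and End retention times are within 0.1 of
# the current unified-RT peak boundaries.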
# write template for training solution to xlsx file
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, 'Sheet1')
@@ -351,7 +372,7 @@ if (prm$verbose >= 1) {
openxlsx::writeData(wb, 'Sheet1', metab[[im]]$name, startCol = nowcol, startRow = 1, rowNames = FALSE, colNames = FALSE)
openxlsx::writeData(wb, 'Sheet1', t(c('Score', 'Start', 'End')) , startCol = nowcol, startRow = 2, rowNames = FALSE, colNames = FALSE)
options("openxlsx.numFmt" = "0")
openxlsx::writeData(wb, 'Sheet1', rep(1,prm$nsmpl) , startCol = nowcol, startRow = 4, rowNames = FALSE, colNames = FALSE)
openxlsx::writeData(wb, 'Sheet1', nowscore[prm$typeorder, im] , startCol = nowcol, startRow = 4, rowNames = FALSE, colNames = FALSE)
options("openxlsx.numFmt" = "0.00")
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pstartmat[prm$typeorder ,im]] , startCol = nowcol+1, startRow = 4, rowNames = FALSE, colNames = FALSE)
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pendmat[prm$typeorder ,im]] , startCol = nowcol+2, startRow = 4, rowNames = FALSE, colNames = FALSE)
......
@@ -75,70 +75,84 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
sink(prm$log_con, append=TRUE) # output errors and messages to log file
}
# load random forest models and add to parameter list
if (prm$ml_type %in% c('mlprod', 'mltrain_pcand') ) { #(!(prm$ml_train && (prm$ml_type == 'initial'))) {
if (file.exists(prm$model_path)) {
load(file = prm$model_path) # 1. model to detect best peak in MRM
prm$model <- model_pcand
prm$train_preprocessing <- train_preprocessing_pcand
prm$median_values <- model_median_values
# only keep what we need
rm('model_pcand', 'model_median_values', 'train_preprocessing_pcand', 'train_dummy_vars_pcand')
} else {
cat(paste0(prm$model_path,": model doesn't exist!"))
return(NULL)
}
}
if (prm$ml_type == 'mlprod') { #} (!prm$ml_train) {
if (file.exists(prm$model_final_path)) {
load(file = prm$model_final_path)# 2. model to evaluate if peak should be reported
prm$model_final <- model_finalp
prm$train_preprocessing_final <- train_preprocessing_finalp
# only keep what we need
rm('model_finalp', 'train_preprocessing_finalp', 'train_dummy_vars_finalp')
} else {
cat(paste0(prm$model_final_path,": model doesn't exist!"))
return(NULL)
# shortcut for mltrain_prepare
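# troubleshoot.RData holds the metab, smpl, msd and prm objects that a previous run
# saved when prm$verbose >= 2, so the mzML files do not have to be re-read and peak
# detection does not have to be re-run while iterating on training preparation.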
troubleshootpath <- file.path(prm$batchdir, 'troubleshoot.RData')
if (prm$runninglocal && (prm$ml_type == 'mltrain_prepare') && file.exists(troubleshootpath)) {
load(file = troubleshootpath) # partially processed data from initial training
prm$log_con <- file(file.path(prm$batchdir, "R_messages.log"),open="a")
load(file = prm$model_path) # 1. model to detect best peak in MRM
prm$model <- model_pcand
prm$train_preprocessing <- train_preprocessing_pcand
prm$median_values <- model_median_values
# update prm from argument parameters again (load() above restored the prm saved in the snapshot, so caller-supplied overrides must be re-applied)
paranames <- names(parameters)
if (!is.null(paranames)) {
for (ip in 1:length(paranames)) {
prm[[paranames[ip]]] <- parameters[[paranames[ip]]]
}
}
}
# calculate unified RT scale
prm <- get_unirt(file.path(prm$batchdir, list.files(path = prm$batchdir, pattern='.mzML')[1]) , prm)
} else {
# load random forest models and add to parameter list
if (prm$ml_type %in% c('mlprod', 'mltrain_pcand', 'mltrain_prepare') ) {
if (file.exists(prm$model_path)) {
load(file = prm$model_path) # 1. model to detect best peak in MRM
prm$model <- model_pcand
prm$train_preprocessing <- train_preprocessing_pcand
prm$median_values <- model_median_values
# only keep what we need
rm('model_pcand', 'model_median_values', 'train_preprocessing_pcand', 'train_dummy_vars_pcand')
} else {
cat(paste0(prm$model_path,": model doesn't exist!"))
return(NULL)
}
}
if (prm$ml_type == 'mlprod') {
if (file.exists(prm$model_final_path)) {
load(file = prm$model_final_path)# 2. model to evaluate if peak should be reported
prm$model_final <- model_finalp
prm$train_preprocessing_final <- train_preprocessing_finalp
# only keep what we need
rm('model_finalp', 'train_preprocessing_finalp', 'train_dummy_vars_finalp')
} else {
cat(paste0(prm$model_final_path,": model doesn't exist!"))
return(NULL)
}
}
# Read mzML-files for unirt, chromdata and samplenames (from files or Rdata-file, if present)-----------
# if (!file.exists("mzML.rds")) {
smpl <- read_mzmlfiles( list.files(path = prm$batchdir, pattern='.mzML'), prm)
# saveRDS(smpl, "mzML.rds") # ! NEEDED for testing, because data structure has been changed!
# }else{
# smpl <- readRDS("mzML.rds")
# }
prm$nsmpl <- length(smpl$chroms)
# calculate unified RT scale
prm <- get_unirt(file.path(prm$batchdir, list.files(path = prm$batchdir, pattern='.mzML')[1]) , prm)
# get additional info from sample.info
smpl <- read_sampleinfo(smpl, prm)
prm$typeorder <- order(smpl$sorttype)
# Read mzML-files for unirt, chromdata and samplenames
smpl <- read_mzmlfiles( list.files(path = prm$batchdir, pattern='.mzML'), prm)
prm$nsmpl <- length(smpl$chroms)
# Read out metabolite db / set up quantifier, 13C/12C quantifiers ------------------------------
metab <- read_metabdb(prm)
prm$nmet <- length(metab)
# quick access elements from metab, e.g. name: as.character(unlist(sapply(metab,'[','name')))
# get additional info from sample.info
smpl <- read_sampleinfo(smpl, prm)
prm$typeorder <- order(smpl$sorttype)
# Replace non-existing quantifiers with qual12c MRMs if present ------------------------------
metab <- quant_qual12c_replacement(metab, smpl, prm)
# Read out metabolite db / set up quantifier, 13C/12C quantifiers ------------------------------
metab <- read_metabdb(prm)
prm$nmet <- length(metab)
# quick access elements from metab, e.g. name: as.character(unlist(sapply(metab,'[','name')))
# Detect peaks (anchromet, groupmets) per sample | shiftgroup ------------------------------
msd <- peak_detection(metab, smpl, prm)
# Replace non-existing quantifiers with qual12c MRMs if present ------------------------------
metab <- quant_qual12c_replacement(metab, smpl, prm)
if (prm$verbose >= 2) {
save(metab, smpl, msd, prm, file = file.path(prm$batchdir, 'troubleshoot.RData') )
}
# Detect peaks (anchromet, groupmets) per sample | shiftgroup ------------------------------
msd <- peak_detection(metab, smpl, prm)
# do not proceed in case of initial training
if(prm$ml_type == 'mltrain_initial'){
print ('qqq_auto_integrate complete.')
return(NULL)
}
if (prm$verbose >= 2) {
save(metab, smpl, msd, prm, file = file.path(prm$batchdir, 'troubleshoot.RData') )
}
# do not proceed in case of initial training
if(prm$ml_type == 'mltrain_initial'){
print ('qqq_auto_integrate complete.')
return(NULL)
}
} # endif mltrain_prepare shortcut
# subset peakdata (only non NA metabolites are returned)
goodmets <- get_goodmets(metab, msd, prm)
......
@@ -299,10 +299,7 @@ train_model <- function(model_type, ml_type = 'rf', base_folder = '.', data_sets
# to enable second round of training
if (model_type=='pcand') {
prm <- list()
prm$ml_type <- "mltrain_pcand"
print(base_folder)
print(model_file_name_prefix)
print(model_file_name)
prm$ml_type <- "mltrain_prepare"
prm$model_path <- paste0(base_folder, model_file_name_prefix, ".Rdata")
for (data in 1:length(data_sets)) {
......
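For context, a minimal sketch (not part of this commit) of how the new 'mltrain_prepare' mode might be invoked directly via process_batch(), assuming the signature shown above; the folder and model paths are placeholders:

# hypothetical invocation of the training-preparation mode added in this commit
prm_override <- list(
  ml_type      = "mltrain_prepare",             # mode set by train_model() for model_type 'pcand'
  model_path   = "/path/to/model_pcand.Rdata",  # placeholder; train_model() builds this from base_folder and the file-name prefix
  runninglocal = TRUE                           # the troubleshoot.RData shortcut is only taken when running locally
)
process_batch(nowfolder = "/path/to/batch", parameters = prm_override)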