Dear Gitlab users, due to maintenance reasons, Gitlab will not be available on Thursday 30.09.2021 from 5:00 pm to approximately 5:30 pm.

Commit a10bb665 authored by Joerg Buescher's avatar Joerg Buescher
Browse files

adding quality scores gauss shape and quantifier ratio and dynamic handling of...

adding quality scores gauss shape and quantifier ratio and dynamic handling of quality scores for QS median values and expert mode plot labels
parent 425dbe56
......@@ -313,23 +313,21 @@ for (im in 1:prm$nmet) {
msd[[im]][[id]]$label1 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n',
'QS_pcand = ', round(final_peak_class$H*100) )
msd[[im]][[id]]$label2 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n',
'Cor:', round(nowqs$QS_cor123, 2),
' T/O:', round(nowqs$QS_T.o, 2),
' T/Bmax:', round(nowqs$QS_T.h, 2),
' T/Bmin:', round(nowqs$QS_T.l, 2),
' dRT:', round(nowqs$QS_dRT, 2),
' width:', round(nowqs$QS_w, 2),
' Cor 12c:', round(nowqs$QS_cor12, 2),
' Cor 13c:', round(nowqs$QS_cor13, 2),
' Cor 12c/13c: ', round(nowqs$QS_cor23, 2),
'\n',
' PInt: ', round(nowqs$QS_height, 2),
' relRT:' , round(nowqs$QS_relRT, 2),
' T/O 12c:' , round(nowqs$QS_T.o2, 2),
' T/O 13c:' , round(nowqs$QS_T.o3, 2),
' dRT_shift:' , round(nowqs$QS_dRTs, 2),
label2 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n')
qsnames <- names(nowqs)
nowcount <- 0
for (iq in 1:length(nowqs) ) {
nowcount <- nowcount + 1
label2 <- paste0( label2, substr(qsnames[iq], 4, nchar(qsnames[iq]) ) , ': ', as.character(round(nowqs[[qsnames[iq]]],2) ), ' ')
if ((nowcount > 7) && (iq < length(nowqs))) {
label2 <- paste0(label2, '\n')
nowcount <- 0
}
}
msd[[im]][[id]]$label2 <- paste0(label2,
'\n',
' rf_pcand:' , final_peak_class$H)
......
......@@ -3,70 +3,72 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
# ideas for future work:
# add QS: the fit quality (residual of the fit, same for all samples)
# alternatively: for each sample/metabolite: residual of shift fit
# add QS: average score of next-best peak candidate: the idea is that
# if there are multiple good candidates, more weight should be given to dRT,
# add QS: average score of next-best peak candidate: the idea is that
# if there are multiple good candidates, more weight should be given to dRT,
# however if there is only 1 good candidate we can be more relaxed concerning dRT
# add QS: Measure how well the ratio of quantifier and 12c qualifier matches the ratio from metdb
# names must start with "QS" in order for the training script to work properly
qs_list <- list( 'QS_cor123' = 0,
'QS_T.o' = 0,
'QS_T.h' = 0,
'QS_T.l' = 0,
'QS_dRT' = 0,
'QS_w' = 0,
'QS_cor12' = 0,
'QS_cor13' = 0,
'QS_cor23' = 0,
'QS_height' = 0,
'QS_relRT' = 0,
'QS_T.o' = 0,
'QS_T.h' = 0,
'QS_T.l' = 0,
'QS_dRT' = 0,
'QS_w' = 0,
'QS_cor12' = 0,
'QS_cor13' = 0,
'QS_cor23' = 0,
'QS_height' = 0,
'QS_relRT' = 0,
'QS_T.o2' = 0,
'QS_T.o3' = 0,
'QS_gauss' = 0,
'QS_1vs2' = 0 ,
'QS_dRTs' = 0)
# QS1: average correlation (quant, qual12c, qual13c)
nowcor <- suppressWarnings(cor(cbind(x1[peakstart:peakend],x2[peakstart:peakend],x3[peakstart:peakend]),use="everything") ) #use="complete.obs"
cor_value <- 0.5*(mean(c(nowcor[1, 2:3], nowcor[2,3]), na.rm = TRUE)+1)
qs_list$QS_cor123 <- check_qs_value(cor_value)
cor_value <- 0.5*(mean(c(nowcor[1, 2:3], nowcor[2,3]), na.rm = TRUE)+1)
qs_list$QS_cor123 <- check_qs_value(cor_value)
# QS2 ratio peak top to highest point outside of series
t_o_value <- max(c(x1[peaktop] / (x1[peaktop] + max(x1[c(1:(max(c(peakstart-1,1))),(min(c(peakend+1,length(x1)))):length(x1))], na.rm = TRUE)),0))
qs_list$QS_T.o <- check_qs_value(t_o_value)
t_o_value <- max(c(x1[peaktop] / (x1[peaktop] + max(x1[c(1:(max(c(peakstart-1,1))),(min(c(peakend+1,length(x1)))):length(x1))], na.rm = TRUE)),0))
qs_list$QS_T.o <- check_qs_value(t_o_value)
# QS3 ratio peak top/max(peakend,peakstart)
t_b_h_value <- ((2 * x1[peaktop])/ (x1[peaktop] + max(x1[peakstart],x1[peakend])) ) -1
qs_list$QS_T.h <- check_qs_value(t_b_h_value)
# QS4 ratio peak top/min(peakend,peakstart)
t_b_l_value <- ((2 * x1[peaktop])/ (x1[peaktop] + min(x1[peakstart],x1[peakend])) ) -1
qs_list$QS_T.l <- check_qs_value(t_b_l_value)
qs_list$QS_T.l <- check_qs_value(t_b_l_value)
# QS5 deviation from expected RT
# algo in mrm_integrate: max(c(1/(1 + (0.02*abs(peaktops[i] - metparam$peaktopa) / (1+metparam$peaktopadelta) ) )^2,0))
peaktopa <- round(metab[[im]]$RT * 60 * prm$samplingfrequency)
dRT_value <- (1/(1+abs(prm$unirt[peaktop + rangestart] - prm$unirt[peaktopa]) ))^2
qs_list$QS_dRT <- check_qs_value(dRT_value)
qs_list$QS_dRT <- check_qs_value(dRT_value)
# deviation from predicted RT
if (prm$ml_type != 'mltrain_initial') {
peaktopa <- round(metab[[im]]$RT * 60 * prm$samplingfrequency) + rtshift
peaktopa <- min(max(c(1, peaktopa + rangestart)) , prm$nscan)
dRT_value <- (1/(1+abs(prm$unirt[peaktop] - prm$unirt[peaktopa]) ))^2
qs_list$QS_dRTs <- check_qs_value(dRT_value)
qs_list$QS_dRTs <- check_qs_value(dRT_value)
}
# QS6 widthscore
if (peakstart != peakend){
widthscore <- 1/ (1+(prm$unirt[peakend] - prm$unirt[peakstart]))
}else{
widthscore <- 0
}
qs_list$QS_w <- check_qs_value(widthscore)
qs_list$QS_w <- check_qs_value(widthscore)
# QS7 | QS8 | QS9 average correlation (quant, qual12c, qual13c)
#Check if nowusetrace is 0
......@@ -74,39 +76,39 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
if(prm$ml_type == "mltrain_initial"){
cor_value <- 0
}else{
cor_value <- prm$rf_median_values[["median_qs7"]]
cor_value <- prm$rf_median_values$QS_cor12
}
}else{
cor_value <- cor(x1[peakstart:peakend],x2[peakstart:peakend],use="everything") * 0.5 + 0.5
}
qs_list$QS_cor12 <- check_qs_value(cor_value)
qs_list$QS_cor12 <- check_qs_value(cor_value)
#Check if nowusetrace is 0
if (nowusetrace[1] == FALSE || nowusetrace[3] == FALSE) {
if(prm$ml_type == "mltrain_initial"){
cor_value <- 0
}else{
cor_value <- prm$rf_median_values[["median_qs8"]]
cor_value <- prm$rf_median_values$QS_cor13
}
}else{
cor_value <- cor(x1[peakstart:peakend],x3[peakstart:peakend],use="everything") * 0.5 + 0.5
}
qs_list$QS_cor13 <- check_qs_value(cor_value)
qs_list$QS_cor13 <- check_qs_value(cor_value)
#Check if nowusetrace is 0
if (nowusetrace[2] == FALSE || nowusetrace[3] == FALSE) {
if(prm$ml_type == "mltrain_initial"){
cor_value <- 0
}else{
cor_value <- prm$rf_median_values[["median_qs9"]]
cor_value <- prm$rf_median_values$QS_cor23
}
}else{
cor_value <- cor(x2[peakstart:peakend],x3[peakstart:peakend],use="everything") * 0.5 + 0.5
}
qs_list$QS_cor23 <- check_qs_value(cor_value)
qs_list$QS_cor23 <- check_qs_value(cor_value)
# QS10 Peak intensity
peak_int_value <- log10(x1[peaktop])
qs_list$QS_height <- check_qs_value(peak_int_value)
......@@ -114,34 +116,55 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
# QS11 #retention time relative to method length
RT_value <- prm$unirt[peaktop] / max(prm$unirt, na.rm = TRUE)
qs_list$QS_relRT <- check_qs_value(RT_value)
qs_list$QS_relRT <- check_qs_value(RT_value)
# QS12 | QS13 (12c & 13c qualifier): ratio peak top to highest point outside
if (is.null(x2) || nowusetrace[2] == FALSE || (peakstart == peakend)){
if(prm$ml_type == "mltrain_initial"){
qual_t_o_value <- 0
}else{
qual_t_o_value <- prm$rf_median_values[["median_qs12"]]
qual_t_o_value <- prm$rf_median_values$QS_T.o2
}
}else{
qual_t_o_value <- max((max(x2[peakstart:peakend],na.rm=TRUE) / ( (max(x2[peakstart:peakend],na.rm=TRUE) + max(c(x2[1:max(c(peakstart-1),1)],x2[min(c(peakend+1,length(x2))):length(x2)]),na.rm=TRUE)) )),0)
}
qs_list$QS_T.o2 <- check_qs_value(qual_t_o_value)
qs_list$QS_T.o2 <- check_qs_value(qual_t_o_value)
# gaussean shape of quantifier
nowoutsidepoints <- which(x1 < 0.5*x1[peaktop]) - peaktop
nowfwhm <- min(nowoutsidepoints[nowoutsidepoints > 0]) - max(nowoutsidepoints[nowoutsidepoints < 0])
qs_list$QS_gauss <- check_qs_value(max(c(0,cor(x1[peakstart:peakend], dnorm(peakstart:peakend,mean=(peaktop+1),sd=(0.5*nowfwhm)) )),na.rm=TRUE) )# correlation to gaussian peak
# ratio quant/12C_qual
# Check if nowusetrace is 0
if (nowusetrace[1] == FALSE || nowusetrace[2] == FALSE ||
is.na(metab[[im]]$quant$intens) || is.na(metab[[im]]$qual12c$intens) ) {
if(prm$ml_type == "mltrain_initial"){
ratio_value <- 0
}else{
ratio_value <- prm$rf_median_values$QS_1vs2
}
} else {
ratio_value <- (1 / (1 + abs(log(x1[peaktop] / x2[peaktop], 2) - log(metab[[im]]$quant$intens / metab[[im]]$qual12c$intens,2))^2 ) )^2
}
qs_list$QS_1vs2 <- check_qs_value(ratio_value)
if (is.null(x3) || nowusetrace[3] == FALSE || (peakstart == peakend)){
if(prm$ml_type == "mltrain_initial"){
qual_t_o_value <- 0
}else{
qual_t_o_value <- prm$rf_median_values[["median_qs13"]]
qual_t_o_value <- prm$rf_median_values$QS_T.o3
}
}else{
qual_t_o_value <- max((max(x3[peakstart:peakend],na.rm=TRUE) / ( (max(x3[peakstart:peakend],na.rm=TRUE) + max(c(x3[1:max(c(peakstart-1),1)],x3[min(c(peakend+1,length(x3))):length(x3)]),na.rm=TRUE)) )),0)
}
qs_list$QS_T.o3 <- check_qs_value(qual_t_o_value)
qs_list$QS_T.o3 <- check_qs_value(qual_t_o_value)
# return qs_list
qs_list
......
......@@ -16,21 +16,15 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
ptm <- proc.time()
pardefault <- par(no.readonly = TRUE)
options("scipen"=100, "digits"=4)
# originalwd <- getwd()
# Set working directory (in Testmode)
if (nowfolder == '') {
print('setting nowfolder')
nowfolder <- getwd()
}else{
if(dir.exists(nowfolder)){
# print('setting nowfolder')
# setwd(nowfolder)
}else{
if(!dir.exists(nowfolder)){
cat(paste0(nowfolder," directory doesn't exist!"))
return(NULL)
}
}
# Remove existing log file and create new one
......@@ -79,19 +73,29 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
# load random forest models and add to parameter list
if (prm$ml_type %in% c('mlprod', 'mltrain_pcand') ) { #(!(prm$ml_train && (prm$ml_type == 'initial'))) {
load(file = prm$model_rf_path) # 1. model to detect best peak in MRM
prm$model_rf <- model_rf_pcand
prm$train_preprocessing <- train_preprocessing_pcand
prm$rf_median_values <- model_rf_median_values
# only keep what we need
rm('model_rf_pcand', 'model_rf_median_values', 'train_preprocessing_pcand', 'train_dummy_vars_pcand')
if (file.exists(prm$model_rf_path)) {
load(file = prm$model_rf_path) # 1. model to detect best peak in MRM
prm$model_rf <- model_rf_pcand
prm$train_preprocessing <- train_preprocessing_pcand
prm$rf_median_values <- model_rf_median_values
# only keep what we need
rm('model_rf_pcand', 'model_rf_median_values', 'train_preprocessing_pcand', 'train_dummy_vars_pcand')
} else {
cat(paste0(prm$model_rf_path,": model doesn't exist!"))
return(NULL)
}
}
if (prm$ml_type == 'mlprod') { #} (!prm$ml_train) {
load(file = prm$model_rf_final_path)# 2. model to evaluate if peak should be reported
prm$model_rf_final <- model_rf_finalp
prm$train_preprocessing_final <- train_preprocessing_finalp
# only keep what we need
rm('model_rf_finalp', 'train_preprocessing_finalp', 'train_dummy_vars_finalp')
if (file.exists(prm$model_rf_final_path)) {
load(file = prm$model_rf_final_path)# 2. model to evaluate if peak should be reported
prm$model_rf_final <- model_rf_finalp
prm$train_preprocessing_final <- train_preprocessing_finalp
# only keep what we need
rm('model_rf_finalp', 'train_preprocessing_finalp', 'train_dummy_vars_finalp')
} else {
cat(paste0(prm$model_rf_path,": model doesn't exist!"))
return(NULL)
}
}
# calculate unified RT scale
......
......@@ -78,7 +78,8 @@ qqq_mrm_integrate <- function(metab, smpl, prm, msd, id, im) {
qs_list <- list()
peaks <- list()
peakstarts <- peakends <- peak_min_start <- peak_max_end <- rep(NA,prm$initial_candidates)
qs <- area1 <- area2 <- area13c <- height1 <- height2 <- height13c <- RT <- rep(NA,prm$initial_candidates)
# peakparameter adjustment------------------------------------------------------
for (i in 1:length(peaktops)){
......@@ -89,10 +90,8 @@ qqq_mrm_integrate <- function(metab, smpl, prm, msd, id, im) {
# initialize peaklists
peak <- list()
napeak <- FALSE
peakstarts <- peakends <- peak_min_start <- peak_max_end <- rep(NA,prm$initial_candidates)
qs <- area1 <- area2 <- area13c <- height1 <- height2 <- height13c <- RT <- rep(NA,prm$initial_candidates)
# Calculate peakstart and peakend (if peaktop is NA set dummy value length(x1))
# Calculate peakstart and peakend (if peaktop is NA set dummy value length(x1))
if (is.na(peaktops[i])) {
peakstarts[i] <- peakends[i] <- length(x1)
napeak <- TRUE
......@@ -221,22 +220,20 @@ qqq_mrm_integrate <- function(metab, smpl, prm, msd, id, im) {
peak$label1 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n',
'QS = ', round(qs[i]*100) )
peak$label2 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n',
'Cor:', round(nowqs$QS_cor123, 2),
' T/O:', round(nowqs$QS_T.o, 2),
' T/Bmax:', round(nowqs$QS_T.h, 2),
' T/Bmin:', round(nowqs$QS_T.l, 2),
' dRT:', round(nowqs$QS_dRT, 2),
' width:', round(nowqs$QS_w, 2),
' Cor 12c:', round(nowqs$QS_cor12, 2),
' Cor 13c:', round(nowqs$QS_cor13, 2),
' Cor 12c/13c: ', round(nowqs$QS_cor23, 2),
'\n',
' PInt: ', round(nowqs$QS_height, 2),
' relRT:' , round(nowqs$QS_relRT, 2),
' T/O 12c:' , round(nowqs$QS_T.o2, 2),
' T/O 13c:' , round(nowqs$QS_T.o3, 2) )
label2 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n')
qsnames <- names(nowqs)
nowcount <- 0
for (iq in 1:length(nowqs) ) {
nowcount <- nowcount + 1
label2 <- paste0( label2, substr(qsnames[iq], 4, nchar(qsnames[iq]) ), ': ', as.character(round(nowqs[[qsnames[iq]]],2) ), ' ')
if ((nowcount > 7) && (iq < length(nowqs))) {
label2 <- paste0(label2, '\n')
nowcount <- 0
}
}
peak$label2 <- label2
# fill peak dummy
peaks[[i]] <- peak
......
......@@ -14,12 +14,8 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
if (model_type == 'pcand') {
prm$ml_type <- "mltrain_initial"
} else if (model_type == 'finalp') {
if (!file.exists(prm$model_rf_path)) {
cat('\n R.data-file of 1. model is missing. Please define path to model in update_prm.xlsx')
stop()
} else {
prm$ml_type <- "mltrain_pcand"
}
# # We cannot check if the model exists here, because we don't have prm yet.
prm$ml_type <- "mltrain_pcand"
}
......@@ -68,12 +64,6 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
print(paste0("Current dataset: ",data_sets[data]))
# check if xlsx file with training solution exists, if not skip execution of dataset
xlsx_path <- paste0(base_folder, data_sets[data],'/manual_peakcheck.xlsx')
if (!file.exists(xlsx_path)) {
print(paste('Cannot find required training solution:',xlsx_path))
next
}
# check if tsv file with training data exists, if not call qqq_auto_integrate to generate it
tsv_path <- paste0(base_folder, data_sets[data],'/qslog.tsv')
if (!file.exists(tsv_path)) {
......@@ -82,6 +72,14 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
nowfolder <- paste0(base_folder, data_sets[data])
process_batch(nowfolder, parameters = prm)
}
# check if xlsx file with training solution exists, if not skip execution of dataset
xlsx_path <- paste0(base_folder, data_sets[data],'/manual_peakcheck.xlsx')
if (!file.exists(xlsx_path)) {
print(paste('Cannot find required training solution:',xlsx_path))
next
}
# Generate training data
if (data==1) {
training_data <- generate_Xy_data_peaks_final(xlsx_path,tsv_path)
......@@ -169,19 +167,14 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
assign(paste0("training_data","_",model_type), train_data)
model_objects <- c(paste0("model_rf","_",model_type),paste0("train_preprocessing","_",model_type),paste0("train_dummy_vars","_",model_type))
# Store median values of quality scores with qualifier information (--> uses as default values )
if(model_type == "pcand"){
# qs7 QS_cor12
# qs8 QS_cor13
# qs9 QS_cor23
# qs12 QS_T.o2
# qs13 QS_T.o3
median_qs7 <- median(x1_train_data$QS_cor12[x12_13c_stack[,1][trainRowNumbers1]=="TRUE"])
median_qs8 <- median(x1_train_data$QS_cor12[x12_13c_stack[,2][trainRowNumbers1]=="TRUE"])
median_qs9 <- median(x1_train_data$QS_cor23[(x12_13c_stack[,1][trainRowNumbers1]=="TRUE") & (x12_13c_stack[,2][trainRowNumbers1]=="TRUE")])
median_qs12 <- median(x1_train_data$QS_T.o2[x12_13c_stack[,1][trainRowNumbers1]=="TRUE"])
median_qs13 <- median(x1_train_data$QS_T.o3[x12_13c_stack[,2][trainRowNumbers1]=="TRUE"])
model_rf_median_values=list("median_qs7" = median_qs7,"median_qs8" = median_qs8,"median_qs9" = median_qs9,"median_qs12" = median_qs12,"median_qs13" = median_qs13)
qsnames <- names(x1_train_data)
model_rf_median_values <- list()
for (iq in 1:length(qsnames)) {
model_rf_median_values[[qsnames[iq]]] <- median( x1_train_data[[qsnames[iq] ]][x12_13c_stack[,1][trainRowNumbers1]==TRUE])
}
model_objects <- c(model_objects,"model_rf_median_values")
}
# Save random forest data to file ---------------------------------------------------------------------
......
......@@ -4,7 +4,7 @@
\alias{process_batch}
\title{process set of .mzML files}
\usage{
process_batch(nowfolder = "", parameters = list())
process_batch(nowfolder = "", parameters = list(), return_msd = FALSE)
}
\arguments{
\item{nowfolder}{folder that contains .mzML files, sample.info and optionally update_prm.R. Default is current working directory}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment