Commit a89c8b94 authored by daniel.eilertz

Merge branch 'dev' of gitlab.gwdg.de:joerg.buescher/automrm into eilertz

parents f651163a 0f2d719d
# Generated by roxygen2: do not edit by hand
export(initialize_prm)
export(qqq_auto_integrate)
export(qqq_model_train)
export(process_batch)
export(train_model)
......@@ -62,7 +62,7 @@ for (id in 1:prm$nsmpl) {
# fine-adjust RT shift only for really polar metabolites
if (prm$polarmets && !prm$sosomets){
if (prm$verbose >=2 ){
pdf(file = "shiftplots.pdf", height = 6, width = 6, family = "Helvetica")
pdf(file = file.path(prm$batchdir, "shiftplots.pdf"), height = 6, width = 6, family = "Helvetica")
}
fitqual <- numeric()
for (id in 1:prm$nsmpl) {
......@@ -340,7 +340,7 @@ for (im in 1:prm$nmet) {
if (prm$verbose >= 1) {
# write log of quality scores for training to tsv file
write.table(qslog, file = 'qslog.tsv', sep = '\t', row.names = FALSE)
write.table(qslog, file = file.path(prm$batchdir, 'qslog.tsv'), sep = '\t', row.names = FALSE)
# write template for training solution to xlsx file
wb <- openxlsx::createWorkbook()
......@@ -357,7 +357,7 @@ if (prm$verbose >= 1) {
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pstartmat[ ,im]] , startCol = nowcol+1, startRow = 4, rowNames = FALSE, colNames = FALSE)
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pendmat[ ,im]] , startCol = nowcol+2, startRow = 4, rowNames = FALSE, colNames = FALSE)
}
openxlsx::saveWorkbook(wb, "manual_peakcheck_template.xlsx", overwrite = TRUE)
openxlsx::saveWorkbook(wb, file.path(prm$batchdir, "manual_peakcheck_template.xlsx"), overwrite = TRUE)
}
msd # return msd
......
generate_Xy_data_peaks_final <- function(xlsx_path,tsv_path){
#xlsx with classification
xlsx_td <- openxlsx::read.xlsx(xlsx_path, colNames = FALSE, rowNames = FALSE)
#tsv with quality scores
tsv_td <- read.table(file = tsv_path, sep = '\t', stringsAsFactors = FALSE)
colnames(tsv_td) <- as.character(tsv_td[1, ])
tsv_td <- as.data.frame(tsv_td[-1, ])
tsvqscols <- grep('QS_', colnames(tsv_td))
pcandcol <- which(colnames(tsv_td) == 'Output_H')
tsvnumcol <- c(tsvqscols, pcandcol)
for (ic in tsvnumcol) {
tsv_td[ , ic] <- as.numeric(tsv_td[ , ic])
}
#initialize Xy_data
my_data <- as.data.frame(matrix(rep(NA,(ncol(xlsx_td)-2)*(nrow(xlsx_td)-3)*(ncol(tsv_td)+4)),ncol=ncol(tsv_td)+4))
colnames(my_data) <- c("y", colnames(tsv_td)[-c(1,2)], paste0("RF",seq(0,100,25)) )
metnames <- unique(as.character(xlsx_td[1,]))
metnames <- metnames[!is.na(metnames)]
# parse y from xlsx data
count <- 1
for (col in 3:ncol(xlsx_td)) {
for (im in 1:length(metnames)) {
# col=3
met <- xlsx_td[1,col]
for(row in 4:nrow(xlsx_td)){
sample <- substr(xlsx_td[row,1],1,nchar(xlsx_td[row,1])-5)
rownames(my_data)[count] <- paste0(met," ",sample)
col <- which(xlsx_td[1, ] == metnames[im] )[1]
for (row in 3:nrow(xlsx_td)) {
if (nchar(xlsx_td[row,1]) > 6) {
sample <- substr(xlsx_td[row,1], 1, nchar(xlsx_td[row,1])-5)
}
rownames(my_data)[count] <- paste0(metnames[im]," ",sample)
my_data[count,1] <- xlsx_td[row,col]
count <- count + 1
}
}
# parse quality scores from tsv file
for (row in 1 : nrow(tsv_td)){
#row <- 4
sample_metab <- paste0(tsv_td[row,2]," ", tsv_td[row,1])
# Get index of sample*metabolites and scrape qs
row_index <- which(rownames(my_data) == sample_metab)
qs_scores<- tsv_td[row, tsvqscols ] # 5:ncol(tsv_td)-1]
# get indices of current metab to aggregate rf hit probability
metab_indices <- which(tsv_td[,2] == tsv_td[row,2])
rf_quantiles <- quantile(tsv_td[metab_indices, pcandcol])
my_data[row_index,2:ncol(my_data)] <- c(qs_scores, rf_quantiles)
}
# Only use subset of the data matrix where peak classification is 2 or 0 (1's are maybe peaks)
#my_data <- my_data[my_data[,1] != 1,]
#my_data[my_data[,1] == 1,1] <- 0
# Set y to factor
my_data$y <- as.factor(my_data$y)
#Return Xy_data
my_data
}
\ No newline at end of file
}
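# Usage sketch (assumption, for illustration only): the xlsx is the manually
# completed copy of manual_peakcheck_template.xlsx and the tsv is the matching
# qslog.tsv written during training.
# Xy_data <- generate_Xy_data_peaks_final(
#   xlsx_path = "manual_peakcheck_template.xlsx",
#   tsv_path  = "qslog.tsv")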
......@@ -31,9 +31,6 @@ initialize_prm <- function() {
prm$pathprefix <- '/'
}
# Set up logging (log to file only when running on server) ------------------------------
prm$log_con <- file("R_messages.log",open="a")
# Set global parameters for peak detection
prm$timerange <- c(0,5) # time range in minutes --> will be re-determined based on first sample further down
prm$samplingfrequency <- 2 # samplingfrequency (time resolution of analysis) in Hz
......
......@@ -8,8 +8,8 @@ peak_detection <- function(metab, smpl, prm) {
cat('Now integrating peaks from all files. \n Counting files...', file=prm$log_con)
print(' ')
qslog_i <- 1
# initialize variables
# outer list: metabolites
# inner lists: samples
msdelement <- list('x1' = rep(0,prm$nscan) , # quantifier chromatogram
......@@ -52,7 +52,7 @@ peak_detection <- function(metab, smpl, prm) {
cat(paste(as.character(id),'-'), file=prm$log_con)
print(paste0(" Peak detection of file: ", id, ": ", smpl$samplenames[id], ' '))
}
#loop over metabolites (quantifier ids)
for(im in 1:prm$nmet){
# im <- 1
......@@ -62,7 +62,7 @@ peak_detection <- function(metab, smpl, prm) {
if (prm$verbose > 0) {
cat('\r', paste(" Working on: ", metab[[im]]$name ," - ","ID: ",im, ' '))
}
# Set default values and continue if metabolite WAS NOT MEASURED in sample
if (!metab[[im]]$measured) {
if (prm$verbose >= 3) {
......@@ -77,17 +77,17 @@ peak_detection <- function(metab, smpl, prm) {
msd[[im]][[id]]$height13 <- 0
msd[[im]][[id]]$RT <- NA
msd[[im]][[id]]$qs <- -1
next
} # not measured
# start new metabolite parameter set
candidates <- list()
candqs <- numeric()
candrt <- numeric()
anypeak <- FALSE
x1 <- x2 <- x3 <- rep(0,prm$nscan)
# Get rollmean of quantifier
if (length(smpl$chroms[[id]][[ metab[[im]]$quant$MRM ]] ) > prm$smoothnum ) {
nonanpos <- !is.na(smpl$chroms[[id]][[metab[[im]]$quant$MRM]])
......@@ -100,7 +100,7 @@ peak_detection <- function(metab, smpl, prm) {
}
}
nowdata <- x1
# Get rollmean 12c qualifiers
if (length(smpl$chroms[[id]][[ metab[[im]]$qual12c$MRM[1] ]] ) > prm$smoothnum ) {
nonanpos <- !is.na(smpl$chroms[[id]][[metab[[im]]$qual12c$MRM[1]]])
......@@ -111,7 +111,7 @@ peak_detection <- function(metab, smpl, prm) {
nowdata <- cbind(nowdata,x2)
}
} else {
if (msd[[im]][[id]]$foundchrom[2]) {
print(paste('12C qualifier not found for', metab[[im]]$name, 'in', smpl$samplenames[id], ' '))
} else {
......@@ -120,7 +120,7 @@ peak_detection <- function(metab, smpl, prm) {
}
}
}
# Get 13c qualifier
if (length(smpl$chroms[[id]][[ metab[[im]]$qual13c$MRM[1] ]] ) > prm$smoothnum ) {
nonanpos <- !is.na(smpl$chroms[[id]][[metab[[im]]$qual13c$MRM[1]]])
......@@ -131,7 +131,7 @@ peak_detection <- function(metab, smpl, prm) {
nowdata <- cbind(nowdata,x3)
}
} else {
if (msd[[im]][[id]]$foundchrom[3]) {
print(paste('13C qualifier not found for', metab[[im]]$name, 'in', smpl$samplenames[id], ' '))
} else {
......@@ -146,31 +146,31 @@ peak_detection <- function(metab, smpl, prm) {
msd[[im]][[id]]$x2 <- x2
msd[[im]][[id]]$x3 <- x3
msd[[im]][[id]]$combidata <- nowdata
# !-------
# GET PEAK CANDIDATES
peak_list <- qqq_mrm_integrate(metab, smpl, prm, msd, id, im)
# !-------
# ToDo: move this for loop to end of qqq_mrm_integrate
# alternatively: get logic from qqq_mrm_integrate and place here
#
for (p in 1:length(peak_list)){
if (!is.list(peak_list[[p]]) ) {
candqs[p] <- -1
candrt[p] <- -1
next
}
# To check: maybe no need to write peak_list to msd[[im]][[id]]$peaks
candqs[p] <- peak_list[[p]]$qs
candrt[p] <- peak_list[[p]]$RT
anypeak <- TRUE
} # endfor peaklist
# Check peak candidates' qs scores against threshold
if (prm$verbose >= 2){
cat('\r', paste(metab[[im]]$name , " --> Peaks' QS:", paste(sapply(candqs,round,2),collapse=" ")))
......@@ -193,16 +193,16 @@ peak_detection <- function(metab, smpl, prm) {
msd[[im]][[id]]$RT <- NA
msd[[im]][[id]]$qs <- -1
msd[[im]][[id]]$qs <- numeric(length(peak_list$qs_list))
next
}
# POLARMETS
# for polar mets, or if there is no convincing peak, take the best peak
if ( (prm$polarmets) || (length(nowgoodpeak) == 0) ) {
# Fill matrices of area, height, RT, qs
bestpeak <- which(candqs == max(candqs))[1]
# add bestpeak to metabolite-sample-data
msd[[im]][[id]]$bestpeak <- bestpeak # for_cleanup: msd[[im]][[id]]$bestoverallpeak <-
if (prm$sum12c) {
......@@ -224,9 +224,9 @@ peak_detection <- function(metab, smpl, prm) {
msd[[im]][[id]]$ylim <- peak_list[[bestpeak]]$ylim
msd[[im]][[id]]$label1 <- peak_list[[bestpeak]]$label1
msd[[im]][[id]]$label2 <- peak_list[[bestpeak]]$label2
qsnames <- names( peak_list[[bestpeak]]$qs_list )
# append qslog info for training (last 2 cols are x12_13c_flags for qs median values)
for(p in 1:length(peak_list)){
metab_name <- metab[[im]]$name
......@@ -235,7 +235,7 @@ peak_detection <- function(metab, smpl, prm) {
peak_RT <- peak_list[[p]]$RT
qs_scores <- peak_list[[p]]$qs_list
x12_13c_flags <- msd[[im]][[id]]$foundchrom[2:3]
if (qslog_i == 1) {
qslog <- c(metab_name,sample_name,peak_id,peak_RT,as.numeric(qs_scores),x12_13c_flags)
} else {
......@@ -243,24 +243,24 @@ peak_detection <- function(metab, smpl, prm) {
}
qslog_i <- qslog_i + 1
}
# Write training data to tsv (quality scores for 1. model)
cat('\r', paste(' --> found Metabolite at RT: ', as.character(round(peak_list[[bestpeak]]$RT,2)) ) )
} # -> else for lipids comes here <-
} # loop over metabolites
}# loop over samples
colnames(qslog) = c("Metabolite", "id", "Peak", "RT", qsnames , "12C", "13C")
if (prm$verbose >= 1) {
# tsv is safer in case of a comma in a metabolite or sample name, like fructose 1,6-bisphosphate
write.table(qslog, file = 'qslog_initial.tsv', sep = '\t', row.names = FALSE)
write.table(qslog, file = file.path(prm$batchdir, 'qslog_initial.tsv'), sep = '\t', row.names = FALSE)
}
msd
}# function end
plot_peaks <- function(metab, smpl, msd, prm) {
# plots of MRMs (quantifier, 12c&13c-qualifier) (n: samples x metabolites)
# plots peakarea, RT(expected and measured)
# adds quality scores to labels (more qs in expertmode)
cat('Creating peakoverview.pdf ', file=prm$log_con)
print(' ')
if(prm$train_path != ""){
pdf_file <- paste0("peakoverview_", prm$train_path, ".pdf")
}else{
pdf_file <- "peakoverview.pdf"
}
pdf(file = pdf_file, height = 2*prm$nsmpl, width = 5* prm$nmet, family = "Helvetica")
pdf(file = file.path(prm$batchdir, pdf_file), height = 2*prm$nsmpl, width = 5* prm$nmet, family = "Helvetica")
par(mai = c(0.5, 0.5, 0.8, 0.5))
layout(matrix(c(1:(prm$nmet*prm$nsmpl)), prm$nsmpl, prm$nmet, byrow = TRUE)) # initiate subplots
# Reverse looping and indexing according to Jörg's plotting logic (samples first, then metabolites)
for (id in 1:prm$nsmpl){
for (im in 1:prm$nmet){
......@@ -25,11 +25,11 @@ plot_peaks <- function(metab, smpl, msd, prm) {
nd <- get_ref_chromatograms(metab, msd, smpl, id, id, im)
nd$x1 <- nd$x1 - prm$bgoffset
nd$x1[nd$x1 < 0] <- 0
nowy <- compressplot(nd$x1)
nowgoody <- !is.na(nowy)
textxpos <- min(prm$unirt[nowgoody], na.rm = TRUE)
if (is.na(msd[[im]][[id]]$peakstart ) || is.na(msd[[im]][[id]]$peakend) ) {
nowgoodplot <- FALSE
msd[[im]][[id]]$ylim <- 1.1 * max(c(nd$x1, nd$x2, nd$x3), na.rm = TRUE)
......@@ -37,33 +37,33 @@ plot_peaks <- function(metab, smpl, msd, prm) {
cat('\r', paste('No peak to plot for', metab[[im]]$name, 'in', smpl$samplenames[id],'. '))
}
}
if (sum(nowgoody) < 2) {
nowgoodplot <- FALSE
plot(0)
} else {
if (msd[[im]][[id]]$report) {
main_col <- "black"
} else {
main_col <- "red"
}
if (prm$expertmode) {
plot(prm$unirt[nowgoody], nowy[nowgoody] ,
type = 'l',
main = msd[[im]][[id]]$label2, col.main = main_col,
ylim = msd[[im]][[id]]$ylim,
xlab='', ylab='',
cex.main=0.7)
} else {
plot(prm$unirt[nowgoody], nowy[nowgoody] ,
type = 'l',
main = msd[[im]][[id]]$label1, col.main = main_col,
ylim = msd[[im]][[id]]$ylim,
xlab='', ylab='',
cex.main=0.7)
}
} # endif there is data to plot
if (nowgoodplot) {
# mark integrated range
nowxa <- msd[[im]][[id]]$peakstart : msd[[im]][[id]]$peakend
......@@ -81,14 +81,14 @@ plot_peaks <- function(metab, smpl, msd, prm) {
polygon(x=c(prm$unirt[nowxa[1]], prm$unirt[nowxa], prm$unirt[nowxa[length(nowxa)]]) ,
y=c(msd[[im]][[id]]$ylim[2] , nowya, msd[[im]][[id]]$ylim[2]) ,
col='#DDDDDD', border=NA)
# draw quantifier line again because it looks nicer
lines(prm$unirt[nowgoody],nowy[nowgoody],lwd=1.2)
}
# legend for quantifier
text(textxpos,0.95*msd[[im]][[id]]$ylim[2], metab[[im]]$quant$MRM , cex=0.5, pos=4)
# draw vertical line at peaktop
lines(prm$unirt[rep(msd[[im]][[id]]$peaktop,2)],
msd[[im]][[id]]$ylim,
......@@ -98,14 +98,14 @@ plot_peaks <- function(metab, smpl, msd, prm) {
lines(rep( metab[[im]]$RT + (msd[[im]][[id]]$predshift / (prm$samplingfrequency * 60)) , 2),
c(0,-100000),
lwd=5, col="blue")
# draw vertical line at exp RT
lines(rep(metab[[im]]$RT ,2),
c(0,-100000),
lwd=3, col="red")
}
# plot 12C qualifier
if (msd[[im]][[id]]$foundchrom[2]) {
nd$x2 <- nd$x2 - prm$bgoffset
......@@ -119,7 +119,7 @@ plot_peaks <- function(metab, smpl, msd, prm) {
}
text(textxpos,0.85*msd[[im]][[id]]$ylim[2], metab[[im]]$qual12c$MRM , col = 'blue', cex=0.5, pos=4)
}
# plot 13c qualifier
if (msd[[im]][[id]]$foundchrom[3]) {
nd$x3 <- nd$x3 - prm$bgoffset
......@@ -133,7 +133,7 @@ plot_peaks <- function(metab, smpl, msd, prm) {
}
text(textxpos,0.75*msd[[im]][[id]]$ylim[2], metab[[im]]$qual13c$MRM , col = 'green', cex=0.5, pos=4)
}
}
}
dev.off()
......
......@@ -2,18 +2,21 @@
#'
#' This function processes all .mzML files in nowfolder (current folder if empty)
#' using metabolite definitions and sample information provided in the same folder.
#' Output files are written to nowfolder.
#'
#' @param nowfolder folder that contains the .mzML files, sample.info and optionally update_prm.xlsx. Default is the current working directory
#' @param parameters list of parameters that override the defaults defined in initialize_prm()
#'
#' @return msd, a nested list of metabolites (outer) and samples (inner) with the processed data
#'
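#' @examples
#' \dontrun{
#' # Minimal usage sketch, assuming defaults from initialize_prm(); the folder
#' # path is a placeholder and 'verbose'/'polarmets' are existing prm settings,
#' # shown here only to illustrate the 'parameters' override.
#' msd <- process_batch(nowfolder = "/path/to/mzml_batch",
#'                      parameters = list(verbose = 2, polarmets = TRUE))
#' }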
#' @export
qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
process_batch <- function(nowfolder = "", parameters = list()) {
# Initialize analysis ------------------------------
ptm <- proc.time()
pardefault <- par(no.readonly = TRUE)
options("scipen"=100, "digits"=4)
originalwd <- getwd()
# originalwd <- getwd()
# Set working directory (in Testmode)
if (nowfolder == '') {
......@@ -21,8 +24,8 @@ qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
nowfolder <- getwd()
}else{
if(dir.exists(nowfolder)){
print('setting nowfolder2')
setwd(nowfolder)
# print('setting nowfolder')
# setwd(nowfolder)
}else{
cat(paste0(nowfolder," directory doesn't exist!"))
return(NULL)
......@@ -30,21 +33,32 @@ qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
}
# Remove existing log file
system("rm R_messages.log")
# Remove existing log file and create new one
system(paste0("rm ", file.path(nowfolder, "R_messages.log")))
# Initialize prm (global settings)
prm <- initialize_prm()
print('initialize complete')
# create new log file
prm$log_con <- file(file.path(nowfolder, "R_messages.log"),open="a")
# update prm from local settings in mzML folder
update_prm_path <- paste0(nowfolder,"/update_prm.xlsx")
update_prm_path <- file.path(nowfolder,"update_prm.xlsx")
if (file.exists(update_prm_path)){
updatedata <- openxlsx::read.xlsx(update_prm_path)
if (('Variable.in.prm' %in% colnames(updatedata)) && ('Value' %in% colnames(updatedata)) ) {
for (iv in 1:nrow(updatedata)) {
prm[[updatedata[iv,'Variable.in.prm']]] <- updatedata[iv,'Value']
nowvalue <- updatedata[iv,'Value']
if (nowvalue %in% c("TRUE", "FALSE")) {
nowvalue <- as.logical(nowvalue)
} else if ( !is.na(as.numeric(nowvalue)) ) {
nowvalue <- as.numeric(nowvalue)
}
prm[[updatedata[iv,'Variable.in.prm']]] <- nowvalue
}
} else {
print('Error in update_prm.xlsx: cannot find required columns "Variable.in.prm" and "Value".')
}
}
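# Sketch of the expected update_prm.xlsx layout, inferred from the column
# checks above (the example values are illustrative only):
#   Variable.in.prm | Value
#   verbose         | 2
#   polarmets       | TRUE
# Values read from the sheet may arrive as text and are coerced to logical
# or numeric in the loop above.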
......@@ -55,42 +69,40 @@ print('initialize complete')
prm[[paranames[ip]]] <- parameters[[paranames[ip]]]
}
}
print('prm update complete')
# save nowfolder to prm
prm$batchdir <- nowfolder
if(!prm$runninglocal){ # if running on server
sink(prm$log_con, append=TRUE) # output errors and messages to log file
}
# Log git info ------------------------------
tryCatch({
cat(paste(timestamp(), '\n Now running process_batch.R',
'\n\n Current user: ', Sys.info()[["user"]], '\n \n'), file=prm$log_con)
scriptgithash <- automrmgithash <- helperfilesgithash <- ''
if (dir.exists('~/data/helperfiles')) {
setwd('~/data/helperfiles')
helperfilesgithash <- system("git describe --always", intern=TRUE)
helperfilesgithash <- system(paste0("git --git-dir=", prm$pathprefix, "data/helperfiles/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for helperfiles: https://gitlab.gwdg.de/joerg.buescher/helperfiles
Commit hash: ', helperfilesgithash), file=prm$log_con)
}
if (dir.exists('~/data/automRm')) {
setwd('~/data/automRm')
automrmgithash <- system("git describe --always", intern=TRUE)
automrmgithash <- system(paste0("git --git-dir=", prm$pathprefix, "data/automRm/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for automRm: https://gitlab.gwdg.de/joerg.buescher/automrm
Commit hash: ', automrmgithash), file=prm$log_con)
}
if (dir.exists('~/code/R_analysis')) {
setwd('~/code/R_analysis')
scriptgithash <- system("git describe --always", intern=TRUE)
scriptgithash <- system(paste0("git --git-dir=", prm$pathprefix, "code/R_analysis/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for processing script: https://gitlab.gwdg.de/daniel.eilertz/ms-web
Commit hash: ', scriptgithash), file=prm$log_con)
}
cat(paste(timestamp(), '\n Now running QQQ_integrate.R
\n Gitlab repository for processing script: https://gitlab.gwdg.de/daniel.eilertz/ms-web
Commit hash: ', scriptgithash,
'\n\n Gitlab repository for helperfiles: https://gitlab.gwdg.de/joerg.buescher/helperfiles
Commit hash: ', helperfilesgithash,
'\n\n Gitlab repository for automRm: https://gitlab.gwdg.de/joerg.buescher/automrm
Commit hash: ', automrmgithash,
'\n\n Current user: ', Sys.info()[["user"]], '\n \n'), file=prm$log_con)
rm('helperfilesgithash', 'scriptgithash', 'automrmgithash')
}, error = function(err) {
# not in docker container used for development, do nothing