Commit 36b3bb68 authored by daniel.eilertz's avatar daniel.eilertz
Browse files

Merge branch 'eilertz' into 'master'

Eilertz

See merge request !15
parents ecea2c65 786e42da
#' automRm: Taking LC-QQQ peak picking to the next level
#'
#' automRm processes LC-QQQ raw data in the open .mzML format to obtain a user-friendly output of signal intensities for every analyte and every sample in .xlsx format. \cr \cr
#' In addition to the .mzML files a matching list of target metabolites must be available. automRm then uses 2 random forest models to 1st. decide which chromatographic peak is the most likely to represent an analyte and 2nd to decide if the data is of sufficient quality to be given to a person with little metabolomics experience.
#' Both random forest models can easily be trained to meet one's LC-MS methods and quality standards.
#'
#'
#' @section automRm functions:
#' @param train_model Train first and/or second model for automated peak picking.
#' @param initialize_prm Load and assign global default varariables. Default variables can be overwritten by the parameters argument when calling process_batch or using edited update_prm.tsv files in working folder).
#' @param process_batch Automated analysis of provided .mzML files.
#'
#' @docType package
#' @name automRm
NULL
#> NULL
get_unirt <- function(samplefile, prm) {
# read first file to get RT prm$timerange
mz <- mzR::openMSfile(samplefile)
nowtic <- mzR::tic(mz)
prm$timerange[1] <- min(c(prm$timerange[1], nowtic[,1] ))
prm$timerange[2] <- max(c(prm$timerange[2], nowtic[,1] ))
# create unified rt vector with leading 0
prm$unirt <- seq(from=(prm$timerange[1]), to=(prm$timerange[2]), by=(1/(prm$samplingfrequency*60)) )
prm$nscan <- length(prm$unirt)
prm
}
\ No newline at end of file
}
......@@ -22,7 +22,7 @@ initialize_prm <- function() {
prm$verbose <- 1
# Check if local or on server ------------------------------
# Check if local or on server
if (as.character(Sys.info()[["user"]]) == 'rstudio'){
prm$runninglocal <- TRUE
prm$pathprefix <- '/home/rstudio/'
......@@ -45,8 +45,8 @@ initialize_prm <- function() {
prm$groupmetsearchwindow <- 90 # search window in seconds
prm$minNormArea <- 10000 # minimum median area of 13C ISTD to use for normalization
prm$minwidth <- 4 # minimum peak width in seconds
prm$qscutoff <- c(0.8,0.6) #0.72 #minimum quality score to report a peak [1] used for outliers [2] used for peaks within RT range
prm$quartile_range_factor <- 3 #factor to spread interquartile range for peak's RT outlier detection
prm$qscutoff <- c(0.8,0.6) # 0.72 - minimum quality score to report a peak [1] used for outliers [2] used for peaks within RT range
prm$quartile_range_factor <- 3 # factor to spread interquartile range for peak's RT outlier detection
prm$initial_candidates <- 5 # to be used in qqq_mrm_integrage
prm$timewindow <- 1.5 # time window for correlating
prm$tshiftcrude <- seq(-60,60,2) # steps for time shift test
......@@ -66,7 +66,7 @@ initialize_prm <- function() {
#?
prm$sum12c <- FALSE # sum 12c qual and quant area/height #
# Polarmets
# Polarmets as default
prm$polarmets <- TRUE
prm$sosomets <- FALSE
prm$excelfile <- paste0(prm$pathprefix, 'data/helperfiles/metabolite_databases/metabdb_really_polar.xlsx')
......
......@@ -36,17 +36,17 @@ peak_detection <- function(metab, smpl, prm) {
'label2' = character() # debugging label for plot
)
msd2 <- list()
for (id in 1:prm$nsmpl) {
for(id in 1:prm$nsmpl) {
msd2[[id]] <- msdelement
}
msd <- list()
for (im in 1:prm$nmet) {
for(im in 1:prm$nmet) {
msd[[im]] <- msd2
}
rm('msdelement', 'msd2')
# loop over samples
for (id in 1:prm$nsmpl){
for(id in 1:prm$nsmpl){
# id <- 1
if (prm$verbose >= 2 ){
cat(paste(as.character(id),'-'), file=prm$log_con)
......@@ -149,13 +149,252 @@ peak_detection <- function(metab, smpl, prm) {
# !-------
# GET PEAK CANDIDATES
peak_list <- qqq_mrm_integrate(metab, smpl, prm, msd, id, im)
#peak_list <- qqq_mrm_integrate(metab, smpl, prm, msd, id, im)
# !-------
# ToDo: move this for loop to end of qqq_mrm_integrate
# alternatively: get logic from qqq_mrm_integrate and place here
#
#
# START mrm_integrate
#
if(TRUE){
x1 <- msd[[im]][[id]]$x1
x2 <- msd[[im]][[id]]$x2
x3 <- msd[[im]][[id]]$x3
nowdata <- msd[[im]][[id]]$coimidata
nowusetrace <- as.logical( c( 1, c(metab[[im]]$qual12c$use, 0)[1] , c(metab[[im]]$qual13c$use,0)[1] ) )
nowusetrace[is.na(nowusetrace)] <- FALSE
# Calculate moving window correlation (over all available time series)
if(TRUE){
# can only calculate correlation and quant/qual ratio if there is more than 1 trace
if (prm$polarmets == FALSE) {
nowcorwindow <- 3*prm$corwindow # larger correlation window for lipids
} else {
nowcorwindow <- prm$corwindow
}
# moving window correlation over available x1,x2,x3
meancor <- rep(0,prm$nscan)
if (length(nowdata) > prm$nscan){
for (ic in (1:(prm$nscan-nowcorwindow) ) ) {
nowcor <- cor(nowdata[ic:(ic-1+nowcorwindow), ])
meancor[ic+ceiling(0.5*nowcorwindow)] <- mean(nowcor[lower.tri(nowcor)], na.rm = TRUE)
}
meancor[is.na(meancor)] <- 0
# more strict on correlation for lipids
if (prm$polarmets == FALSE) {
meancor[meancor < 0.75] <- 0
}
}
}
# calculate slope, maxima, minima ... of series
if(TRUE){
# normalize time series x1,x1 for plotting
xtest <- (x1 - min(x1,na.rm=TRUE))/ (max(x1,na.rm=TRUE)-min(x1,na.rm=TRUE))
# slope
x1slope <- diff(x1)
#local maxima
local_max <- which(diff(sign(diff(x1)))==-2)
#local minima
local_min <- which(diff(sign(diff(x1)))==2)
# top5 local maxima
peaktops <- local_max[order(x1[local_max], decreasing=T)][1:prm$initial_candidates]
# slope < 0
ls <- which(x1slope < 0) #blue
# slope > 0
us <- which(x1slope > 0) #green
}
# Test plot x1 with maxima/minima and slope
if ( prm$verbose >= 3 ){
par(mar=c(2,2,0.5,0.5))
layout(matrix(c(1,1,2,2), 2, 2, byrow = TRUE),
widths=c(3,1), heights=c(2,1))
plot(xtest)
for(i in 1:length(local_max)){
abline(v=local_max,col="red",lty="dotted")
}
for(i in 1:length(local_min)){
abline(v=local_min,col="blue",lty="dotted")
}
plot(x1slope)
points(ls,x1slope[ls],col="blue")
points(us,x1slope[us],col="red")
}
qs_list <- list()
peak_list <- list()
peakstarts <- peakends <- peak_min_start <- peak_max_end <- rep(NA,prm$initial_candidates)
qs <- area1 <- area2 <- area13c <- height1 <- height2 <- height13c <- RT <- rep(NA,prm$initial_candidates)
# peakparameter adjustment
for (i in 1:length(peaktops)){
#i <- 1
if (prm$verbose >=1 ) {
cat('\r', i)
}
# initialize peaklists
peak <- list()
napeak <- FALSE
# Calculate peakstart and peakend (if peaktop is NA set dummy value length(x1))
if (is.na(peaktops[i])) {
peakstarts[i] <- peakends[i] <- length(x1)
napeak <- TRUE
}else{
#PEAKSTART
# nearest local minimum left from peaktop
peak_min_start[i] <- max(c(local_min[sign(local_min - peaktops[i]) < 0],0))
# maximum range of bad slope cutoff left from peaktop
peak_start_width_thresh <- round(peaktops[i] - (peaktops[i] - peak_min_start[i])/3)
# bad slope indices between potential peakstart and peaktop
bad_slopes <- which(abs(x1slope[peak_min_start[i]:peak_start_width_thresh]) < max(abs(x1slope[peak_min_start[i]:peak_start_width_thresh]),na.rm=T)*0.1)
# x-shift of peakstart (0 if bad_slopes is empty)
shift <- max(c(0,bad_slopes))
peakstarts[i] <- max(c(1,peak_min_start[i] + shift))
#PEAKEND
# nearest local minimum right from peaktop
peak_max_end[i] <- min(c(local_min[sign(local_min - peaktops[i]) > 0],length(x1)))
# maximum range of bad slope cutoff right from peaktop
peak_end_width_thresh <- round(peak_max_end[i] - (peak_max_end[i] - peaktops[i])/3)
# bad slope indices between potential peakstart and peaktop
bad_slopes <- which(abs(x1slope[peak_max_end[i]:peak_end_width_thresh]) < max(abs(x1slope[peak_max_end[i]:peak_end_width_thresh]),na.rm=T)*0.1)
# x-shift of peakend (0 if bad_slopes is empty)
shift <- max(c(0,bad_slopes))
peakends[i] <- min(c(peak_max_end[i] - shift),length(x1))
}
# Plotting (gurumode)
if (!napeak & (prm$verbose >= 3 )){
# graphichal parameters
par(mar=c(2,2,0.5,0.5))
layout(matrix(c(1,1,2,2), 2, 2, byrow = TRUE),
widths=c(3,1), heights=c(2,1))
# x range around peak for plotting
peak_x_range <- (max(c(1,peak_min_start[i]-100))):min(c(prm$nscan,peak_max_end[i]+100))
# Plot peak MRM, start, top end
plot(peak_x_range,xtest[peak_x_range])#,xaxt='n')
header <- paste(i, " - " , metab[[im]]$name, "(",im,")")
text(peak_x_range[1]+20,max(xtest[peak_x_range],na.rm=T),labels=header)
lines(peak_x_range,xtest[peak_x_range])
points(us,xtest[us],col="green",pch=16)
points(ls,xtest[ls],col="blue",pch=16)
abline(v=peak_min_start[i],col=rainbow(length(peaktops))[i],lty=2)
abline(v=peakstarts[i],col="black",lty=3)
abline(v=peaktops[i],col=rainbow(length(peaktops))[i])
abline(v=peak_max_end[i],col=rainbow(length(peaktops))[i],lty=2)
abline(v=peakends[i],col="black",lty=3)
# plot peak slope
plot(peak_x_range,x1slope[peak_x_range], type="l")
abline(h=0)
points(ls[!is.na(match(ls,peak_x_range))],x1slope[ls[!is.na(match(ls,peak_x_range))]], type="p",col="blue")
points(us[!is.na(match(us,peak_x_range))],x1slope[us[!is.na(match(us,peak_x_range))]], type="p",col="green")
}
# Get area, heigth and RT
if(TRUE){
# safe peak-areas for met
area1[i] <- if(!napeak) sum(x1[peakstarts[i]:peakends[i]])/prm$samplingfrequency else NA
area2[i] <- if(!is.null(x2)) sum(x2[peakstarts[i]:peakends[i]])/prm$samplingfrequency else NA
area13c[i] <- if(!is.null(x3)) sum(x3[peakstarts[i]:peakends[i]])/prm$samplingfrequency else NA
height1[i] <- x1[peaktops[i]]
# save peak-heights for met
if (length(x2) == 0){
height2[i] <- 0
} else {
height2[i] <- x2[peaktops[i]]
}
if (length(x3) == 0){
height13c[i] <- 0
} else {
height13c[i] <- x3[peaktops[i]]
}
# Save peak retention time
RT[i] <- prm$unirt[peaktops[i]]
}
nowqs <- get_quality_scores(x1, x2, x3, peakstarts[i], peaktops[i], peakends[i], nowusetrace, im, metab, prm)
# Calculate peak quality scores
if (prm$ml_type == "mltrain_initial"){
# Calculate mean peak's qs (squared!)
qs[i] <- (mean(as.numeric(nowqs), na.rm = TRUE))^2
} else {
# Transform input data
trans <- predict(prm$train_preprocessing, as.data.frame(nowqs) ) # as.data.frame(t(nowqs))
# RF Model predictions
qs[i] <- predict(prm$model, trans,type="prob")$H
}
# add nowqs to peak parameters
qs_list[[i]] <- nowqs
# Check peaks for killcriteria
killcriteria <- c( napeak,
(abs(peakstarts[i]-peaktops[i]) <=prm$minDistTopBorder) ,
(abs(peakends[i]-peaktops[i]) <=prm$minDistTopBorder) ,
(x1[peakstarts[i]]>=x1[peaktops[i]]) ,
(x1[peakends[i]]>=x1[peaktops[i]]) ,
((peakends[i]-peakstarts[i]) < (prm$minwidth*prm$samplingfrequency)) )
killcriteria[ is.na(killcriteria)] <- FALSE
# discard peaks
if (any(killcriteria)) {
qs[i] <- 0
}
# Set up peak data structure for further procssing
peak$area1 <- area1[i]
peak$area2 <- area2[i]
peak$area13c <- area13c[i]
peak$height1 <- height1[i]
peak$height2 <- height2[i]
peak$height13c <- height13c[i]
peak$RT <- RT[i]
peak$qs <- peak$final_peak_class <- qs[i]
peak$qs_list <- qs_list[[i]]
peak$peakstart <- peakstarts[i]
peak$peaktop <- peaktops[i]
peak$peakend <- peakends[i]
peak$ylim <- c(0,1.1*max(c(100,height1[i],nowusetrace[2]*height2[i],nowusetrace[3]*height13c[i]),na.rm = TRUE))
peak$label1 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n',
'QS = ', round(qs[i]*100) )
label2 <- paste0(metab[[im]]$name, '\n',
smpl$samplenames[[id]], '\n')
qsnames <- names(nowqs)
nowcount <- 0
for (iq in 1:length(nowqs) ) {
nowcount <- nowcount + 1
label2 <- paste0( label2, substr(qsnames[iq], 4, nchar(qsnames[iq]) ), ': ', as.character(round(nowqs[[qsnames[iq]]],2) ), ' ')
if ((nowcount > 7) && (iq < length(nowqs))) {
label2 <- paste0(label2, '\n')
nowcount <- 0
}
}
peak$label2 <- label2
# fill peak dummy
peak_list[[i]] <- peak
} # endfor all peaktops
#return peaks
#peak_list
}
#
# END mrm_integrate
#
for (p in 1:length(peak_list)){
if (!is.list(peak_list[[p]]) ) {
candqs[p] <- -1
......
......@@ -11,14 +11,13 @@
#'
#' @export
process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE) {
# process_batch('~/data/helperfiles/testdata/LC_machine_learning/ccm2/' )
# Initialize analysis ------------------------------
#setwd("/home/rstudio/data/helperfiles/testdata/LC_machine_learning/4sample_am")
# Initialize analysis
ptm <- proc.time()
pardefault <- par(no.readonly = TRUE)
options("scipen"=100, "digits"=4)
# Set working directory (in Testmode)
# Set working directory (use current directory if not specified)
if (nowfolder == '') {
nowfolder <- getwd()
}else{
......@@ -33,55 +32,21 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
}
}
# Remove existing log file and create new one
#system(paste0("rm ", file.path(nowfolder, "R_messages.log")))
# Initialize prm (global settings)
prm <- initialize_prm()
# create new log file
prm$log_con <- file(file.path(nowfolder, "R_messages.log"),open="a")
# Update prm from tsv config file and arguments
prm <- update_prm(nowfolder,parameters,prm)
# update prm from local settings in mzML folder
update_prm_path <- file.path(nowfolder,"update_prm.tsv")
if (file.exists(update_prm_path)){
updatedata <- read.table(update_prm_path, sep = '\t', stringsAsFactors = FALSE)
colnames(updatedata) <- updatedata[1, ]
updatedata <- updatedata[-1, ]
if (('Variable in prm' %in% colnames(updatedata)) && ('Value' %in% colnames(updatedata)) ) {
print(paste0('Updating parameters from ', update_prm_path))
for (iv in 1:nrow(updatedata)) {
nowvalue <- updatedata[iv,'Value']
if (nowvalue %in% c("TRUE", "FALSE")) {
nowvalue <- as.logical(nowvalue)
} else if ( !is.na(as.numeric(nowvalue)) ) {
nowvalue <- as.numeric(nowvalue)
}
prm[[updatedata[iv,'Variable in prm']]] <- nowvalue
}
} else {
print(paste0('Error in ',update_prm_path ,' , cannot find required columns "Variabel in prm" and "Value".'))
}
# output errors and messages to log file if running on server
if(!prm$runninglocal){
sink(prm$log_con, append=TRUE)
}
# update prm from argument parameters
paranames <- names(parameters)
if (!is.null(paranames)) {
for (ip in 1:length(paranames)) {
prm[[paranames[ip]]] <- parameters[[paranames[ip]]]
}
}
# save nowfolder to prm
prm$batchdir <- nowfolder
if(!prm$runninglocal){ # if running on server
sink(prm$log_con, append=TRUE) # output errors and messages to log file
}
# shortcut for mltrain_prepare
# ??? shortcut for mltrain_prepare
troubleshootpath <- file.path(prm$batchdir, 'troubleshoot.RData')
if (prm$runninglocal && (prm$ml_type == 'mltrain_prepare') && file.exists(troubleshootpath)) {
load(file = troubleshootpath) # partially processed data from initial training
......@@ -92,7 +57,6 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
prm[[paranames[ip]]] <- parameters[[paranames[ip]]]
}
}
prm$batchdir <- nowfolder
prm$log_con <- file(file.path(prm$batchdir, "R_messages.log"),open="a")
load(file = prm$model_path) # 1. model to detect best peak in MRM
......@@ -138,7 +102,7 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
prm <- get_unirt(file.path(prm$batchdir, list.files(path = prm$batchdir, pattern='.mzML')[1]) , prm)
# Read mzML-files for unirt, chromdata and samplenames
smpl <- read_mzmlfiles( list.files(path = prm$batchdir, pattern='.mzML'), prm)
smpl <- read_mzmlfiles(list.files(path = prm$batchdir, pattern='.mzML'), prm)
prm$nsmpl <- length(smpl$chroms)
# get additional info from sample.info
......@@ -159,6 +123,7 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
# Detect peaks (anchromet, groupmets) per sample | shiftgroup ------------------------------
msd <- peak_detection(metab, smpl, prm)
# Save troubleshoot data eventually
if (prm$verbose >= 2) {
save(metab, smpl, msd, prm, file = file.path(prm$batchdir, 'troubleshoot.RData') )
}
......
......@@ -77,11 +77,11 @@ qqq_mrm_integrate <- function(metab, smpl, prm, msd, id, im) {
}
qs_list <- list()
peaks <- list()
peak_list <- list()
peakstarts <- peakends <- peak_min_start <- peak_max_end <- rep(NA,prm$initial_candidates)
qs <- area1 <- area2 <- area13c <- height1 <- height2 <- height13c <- RT <- rep(NA,prm$initial_candidates)
# peakparameter adjustment------------------------------------------------------
# peakparameter adjustment
for (i in 1:length(peaktops)){
#i <- 1
if (prm$verbose >=1 ) {
......@@ -236,9 +236,9 @@ qqq_mrm_integrate <- function(metab, smpl, prm, msd, id, im) {
peak$label2 <- label2
# fill peak dummy
peaks[[i]] <- peak
peak_list[[i]] <- peak
} # endfor all peaktops
#return peaks
peaks
peak_list
} # end function
update_prm <- function(nowfolder,parameters,prm){
# update prm from local settings in nowfolder
update_prm_path <- file.path(nowfolder,"update_prm.tsv")
if (file.exists(update_prm_path)){
updatedata <- read.table(update_prm_path, sep = '\t', stringsAsFactors = FALSE)
colnames(updatedata) <- updatedata[1, ]
updatedata <- updatedata[-1, ]
if (('Variable in prm' %in% colnames(updatedata)) && ('Value' %in% colnames(updatedata)) ) {
print(paste0('Updating parameters from ', update_prm_path))
for (iv in 1:nrow(updatedata)) {
nowvalue <- updatedata[iv,'Value']
if (nowvalue %in% c("TRUE", "FALSE")) {
nowvalue <- as.logical(nowvalue)
} else if ( !is.na(as.numeric(nowvalue)) ) {
nowvalue <- as.numeric(nowvalue)
}
prm[[updatedata[iv,'Variable in prm']]] <- nowvalue
}
} else {
print(paste0('Error in ',update_prm_path ,' , cannot find required columns "Variabel in prm" and "Value".'))
}
}
# update prm from argument parameters
paranames <- names(parameters)
if (!is.null(paranames)) {
for (ip in 1:length(paranames)) {
#ip = 1
prm[[paranames[ip]]] <- parameters[[paranames[ip]]]
}
}
# save nowfolder to prm
prm$batchdir <- nowfolder
prm
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment