Commit a89c8b94 authored by daniel.eilertz

Merge branch 'dev' of gitlab.gwdg.de:joerg.buescher/automrm into eilertz

parents f651163a 0f2d719d
# Generated by roxygen2: do not edit by hand
export(initialize_prm)
export(qqq_auto_integrate)
export(qqq_model_train)
export(process_batch)
export(train_model)
......@@ -62,7 +62,7 @@ for (id in 1:prm$nsmpl) {
# fine-adjust rt shift only for really polar
if (prm$polarmets && !prm$sosomets){
if (prm$verbose >=2 ){
pdf(file = "shiftplots.pdf", height = 6, width = 6, family = "Helvetica")
pdf(file = file.path(prm$batchdir, "shiftplots.pdf"), height = 6, width = 6, family = "Helvetica")
}
fitqual <- numeric()
for (id in 1:prm$nsmpl) {
......@@ -340,7 +340,7 @@ for (im in 1:prm$nmet) {
if (prm$verbose >= 1) {
# write log of quality scores for training to tsv file
write.table(qslog, file = 'qslog.tsv', sep = '\t', row.names = FALSE)
write.table(qslog, file = file.path(prm$batchdir, 'qslog.tsv'), sep = '\t', row.names = FALSE)
# write template for training solution to xlsx file
wb <- openxlsx::createWorkbook()
......@@ -357,7 +357,7 @@ if (prm$verbose >= 1) {
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pstartmat[ ,im]] , startCol = nowcol+1, startRow = 4, rowNames = FALSE, colNames = FALSE)
openxlsx::writeData(wb, 'Sheet1', prm$unirt[pendmat[ ,im]] , startCol = nowcol+2, startRow = 4, rowNames = FALSE, colNames = FALSE)
}
openxlsx::saveWorkbook(wb, "manual_peakcheck_template.xlsx", overwrite = TRUE)
openxlsx::saveWorkbook(wb, file.path(prm$batchdir, "manual_peakcheck_template.xlsx"), overwrite = TRUE)
}
msd # return msd
......
......@@ -19,15 +19,19 @@ generate_Xy_data_peaks_final <- function(xlsx_path,tsv_path){
my_data <- as.data.frame(matrix(rep(NA,(ncol(xlsx_td)-2)*(nrow(xlsx_td)-3)*(ncol(tsv_td)+4)),ncol=ncol(tsv_td)+4))
colnames(my_data) <- c("y", colnames(tsv_td)[-c(1,2)], paste0("RF",seq(0,100,25)) )
metnames <- unique(as.character(xlsx_td[1,]))
metnames <- metnames[!is.na(metnames)]
# parse y from xlsx data
count <- 1
for (col in 3:ncol(xlsx_td)) {
for (im in 1:length(metnames)) {
# col=3
met <- xlsx_td[1,col]
for(row in 4:nrow(xlsx_td)){
sample <- substr(xlsx_td[row,1],1,nchar(xlsx_td[row,1])-5)
rownames(my_data)[count] <- paste0(met," ",sample)
col <- which(xlsx_td[1, ] == metnames[im] )[1]
for (row in 3:nrow(xlsx_td)) {
if (nchar(xlsx_td[row,1]) > 6) {
sample <- substr(xlsx_td[row,1], 1, nchar(xlsx_td[row,1])-5)
}
rownames(my_data)[count] <- paste0(metnames[im]," ",sample)
my_data[count,1] <- xlsx_td[row,col]
count <- count + 1
}
......
......@@ -31,9 +31,6 @@ initialize_prm <- function() {
prm$pathprefix <- '/'
}
# Set up logging (log to file only when running on server) ------------------------------
prm$log_con <- file("R_messages.log",open="a")
# Set global parameters for peak detection
prm$timerange <- c(0,5) # time range in minutes --> will be re-determined based on first sample further down
prm$samplingfrequency <- 2 # samplingfrequency (time resolution of analysis) in Hz
......
......@@ -257,7 +257,7 @@ peak_detection <- function(metab, smpl, prm) {
if (prm$verbose >= 1) {
# tsv is safer in case of a comma in a metabolite or sample name, like fructose 1,6-bisphosphate
write.table(qslog, file = 'qslog_initial.tsv', sep = '\t', row.names = FALSE)
write.table(qslog, file = file.path(prm$batchdir, 'qslog_initial.tsv'), sep = '\t', row.names = FALSE)
}
msd
......
......@@ -11,7 +11,7 @@ plot_peaks <- function(metab, smpl, msd, prm) {
}else{
pdf_file <- "peakoverview.pdf"
}
pdf(file = pdf_file, height = 2*prm$nsmpl, width = 5* prm$nmet, family = "Helvetica")
pdf(file = file.path(prm$batchdir, pdf_file), height = 2*prm$nsmpl, width = 5* prm$nmet, family = "Helvetica")
par(mai = c(0.5, 0.5, 0.8, 0.5))
layout(matrix(c(1:(prm$nmet*prm$nsmpl)), prm$nsmpl, prm$nmet, byrow = TRUE)) # initiate subplots
......
......@@ -2,18 +2,21 @@
#'
#' This function processes all .mzML files in nowfolder (current folder if empty)
#' using metabolite definitions and sample information provided in the same folder.
#' Output files are written to nowfolder.
#'
#' @param nowfolder folder that contains .mzML files, sample.info and optionally update_prm.xlsx. Default is the current working directory
#' @param parameters list of parameters that overwrites defaults defined in initialize_prm()
#'
#' @return msd, a list of processed data for all metabolites and samples
#'
#' @export
qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
process_batch <- function(nowfolder = "", parameters = list()) {
# Initialize analysis ------------------------------
ptm <- proc.time()
pardefault <- par(no.readonly = TRUE)
options("scipen"=100, "digits"=4)
originalwd <- getwd()
# originalwd <- getwd()
# Set working directory (in Testmode)
if (nowfolder == '') {
......@@ -21,8 +24,8 @@ qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
nowfolder <- getwd()
}else{
if(dir.exists(nowfolder)){
print('setting nowfolder2')
setwd(nowfolder)
# print('setting nowfolder')
# setwd(nowfolder)
}else{
cat(paste0(nowfolder," directory doesn't exist!"))
return(NULL)
......@@ -30,21 +33,32 @@ qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
}
# Remove existing log file
system("rm R_messages.log")
# Remove existing log file and create new one
system(paste0("rm ", file.path(nowfolder, "R_messages.log")))
# Initialize prm (global settings)
prm <- initialize_prm()
print('initialize complete')
# create new log file
prm$log_con <- file(file.path(nowfolder, "R_messages.log"),open="a")
# update prm from local settings in mzML folder
update_prm_path <- paste0(nowfolder,"/update_prm.xlsx")
update_prm_path <- file.path(nowfolder,"update_prm.xlsx")
if (file.exists(update_prm_path)){
updatedata <- openxlsx::read.xlsx(update_prm_path)
if (('Variable.in.prm' %in% colnames(updatedata)) && ('Value' %in% colnames(updatedata)) ) {
for (iv in 1:nrow(updatedata)) {
prm[[updatedata[iv,'Variable.in.prm']]] <- updatedata[iv,'Value']
nowvalue <- updatedata[iv,'Value']
if (nowvalue %in% c("TRUE", "FALSE")) {
nowvalue <- as.logical(nowvalue)
} else if ( !is.na(as.numeric(nowvalue)) ) {
nowvalue <- as.numeric(nowvalue)
}
prm[[updatedata[iv,'Variable.in.prm']]] <- nowvalue
}
} else {
print('Error in update_prm.xlsx, cannot find required columns "Variable in prm" and "Value".')
}
}
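# A minimal sketch (hypothetical variable names and values) of an update_prm.xlsx that the
# block above would accept; only the "Variable in prm" and "Value" column headers are required:
#   overrides <- data.frame("Variable in prm" = c("verbose", "runninglocal"),
#                           "Value"           = c("2", "TRUE"),
#                           check.names = FALSE)
#   openxlsx::write.xlsx(overrides, file.path(nowfolder, "update_prm.xlsx"))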
......@@ -55,42 +69,40 @@ print('initialize complete')
prm[[paranames[ip]]] <- parameters[[paranames[ip]]]
}
}
print('prm update complete')
# save nowfolder to prm
prm$batchdir <- nowfolder
if(!prm$runninglocal){ # if running on server
sink(prm$log_con, append=TRUE) # output errors and messages to log file
}
# Log git info ------------------------------
tryCatch({
cat(paste(timestamp(), '\n Now running process_batch.R',
'\n\n Current user: ', Sys.info()[["user"]], '\n \n'), file=prm$log_con)
scriptgithash <- automrmgithash <- helperfilesgithash <- ''
if (dir.exists('~/data/helperfiles')) {
setwd('~/data/helperfiles')
helperfilesgithash <- system("git describe --always", intern=TRUE)
helperfilesgithash <- system(paste0("git --git-dir=", prm$pathprefix, "data/helperfiles/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for helperfiles: https://gitlab.gwdg.de/joerg.buescher/helperfiles
Commit hash: ', helperfilesgithash), file=prm$log_con)
}
if (dir.exists('~/data/automRm')) {
setwd('~/data/automRm')
automrmgithash <- system("git describe --always", intern=TRUE)
automrmgithash <- system(paste0("git --git-dir=", prm$pathprefix, "data/automRm/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for automRm: https://gitlab.gwdg.de/joerg.buescher/automrm
Commit hash: ', automrmgithash), file=prm$log_con)
}
if (dir.exists('~/code/R_analysis')) {
setwd('~/code/R_analysis')
scriptgithash <- system("git describe --always", intern=TRUE)
}
cat(paste(timestamp(), '\n Now running QQQ_integrate.R
\n Gitlab repository for processing script: https://gitlab.gwdg.de/daniel.eilertz/ms-web
Commit hash: ', scriptgithash,
'\n\n Gitlab repository for helperfiles: https://gitlab.gwdg.de/joerg.buescher/helperfiles
Commit hash: ', helperfilesgithash,
'\n\n Gitlab repository for automRm: https://gitlab.gwdg.de/joerg.buescher/automrm
Commit hash: ', automrmgithash,
'\n\n Current user: ', Sys.info()[["user"]], '\n \n'), file=prm$log_con)
scriptgithash <- system(paste0("git --git-dir=", prm$pathprefix, "code/R_analysis/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for processing script: https://gitlab.gwdg.de/daniel.eilertz/ms-web
Commit hash: ', scriptgithash), file=prm$log_con)
}
rm('helperfilesgithash', 'scriptgithash', 'automrmgithash')
}, error = function(err) {
# not in docker container used for development, do nothing
})
print('passed git check')
# Reset working directory
setwd(nowfolder)
# load random forest models and add to parameter list
if (prm$ml_type %in% c('mlprod', 'mltrain_pcand') ) { #(!(prm$ml_train && (prm$ml_type == 'initial'))) {
......@@ -110,15 +122,15 @@ print('passed git check')
}
# calculate unified RT scale
prm <- get_unirt(list.files(pattern='.mzML')[1], prm)
prm <- get_unirt(file.path(prm$batchdir, list.files(path = prm$batchdir, pattern='.mzML')[1]) , prm)
# Read mzML-files for unirt, chromdata and samplenames (from files or Rdata-file, if present)-----------
if (!file.exists("mzML.rds")) {
smpl <- read_mzmlfiles(list.files(pattern='.mzML'), prm)
saveRDS(smpl, "mzML.rds") # ! NEEDED for testing, because data structure has been changed!
}else{
smpl <- readRDS("mzML.rds")
}
# if (!file.exists("mzML.rds")) {
smpl <- read_mzmlfiles( list.files(path = prm$batchdir, pattern='.mzML'), prm)
# saveRDS(smpl, "mzML.rds") # ! NEEDED for testing, because data structure has been changed!
# }else{
# smpl <- readRDS("mzML.rds")
# }
prm$nsmpl <- length(smpl$chroms)
# get additional info from sample.info
......@@ -136,7 +148,7 @@ print('passed git check')
msd <- peak_detection(metab, smpl, prm)
if (prm$verbose >= 2) {
save(metab, smpl, msd, prm, file='troubleshoot.RData')
save(metab, smpl, msd, prm, file = file.path(prm$batchdir, 'troubleshoot.RData') )
}
# do not proceed in case of initial training
......@@ -181,14 +193,14 @@ print('passed git check')
cat('\n Total R-time for this dataset: ', file=prm$log_con)
cat(proc.time() - ptm , file=prm$log_con)
ptime <- proc.time() - ptm
write(as.numeric(ptime[3]/60),file='R.time')
write(as.numeric(ptime[3]/60),file = file.path(prm$batchdir, 'R.time'))
# for documentation purposes append session info to log. do this at the end because it's a messy read.
cat(as.character(sessionInfo()), file=prm$log_con)
tryCatch({
# Call sample postprocessing if in pipeline mode
if (!(as.character(Sys.info()[["user"]]) == 'rstudio')){
if (!(as.character(Sys.info()[["user"]]) == 'rstudio') && !prm$runninglocal ){
rm(list = ls()) # empty workspace to avoid out of memory options
gc()
# system("chmod 777 overviewheatmap.pdf", intern=FALSE, wait=FALSE)
......@@ -199,13 +211,15 @@ print('passed git check')
print('process_batch complete.')
# tidy up
setwd(originalwd)
# setwd(originalwd)
par(pardefault)
}, error = function(err) {
# if tidying up doesn't work, well too bad.
})
msd
} # endfunction
......
......@@ -42,7 +42,7 @@ read_mzmlfiles <- function(filelist,prm){
options(warn=-1)
# read file from mzML
mz <- mzR::openMSfile(filelist[filenum], backend = "pwiz")
mz <- mzR::openMSfile(file.path(prm$batchdir,filelist[filenum]), backend = "pwiz")
options(warn=0)
nowlabel <- character(mzR::nChrom(mz)-1)
......
......@@ -6,8 +6,8 @@ read_sampleinfo <- function(smpl, prm) {
cat('\n Reading additional sample info ', file=prm$log_con)
if (file.exists('sample.info')){
sampleinfo <- read.csv('sample.info', stringsAsFactors = FALSE)
if (file.exists(file.path(prm$batchdir, 'sample.info'))){
sampleinfo <- read.csv(file.path(prm$batchdir, 'sample.info'), stringsAsFactors = FALSE)
activetypes <- character()
testtypes <- c('type1','type2','type3','common_name','celltypename','cultivationname')
for (itn in 1:length(testtypes)){
......
......@@ -8,43 +8,25 @@
#' @param model_file_name name of model to be written do disc
#'
#' @export
qqq_model_train <- function(model_type,base_folder,data_sets,model_file_name){
# # load function to prepare training data -----------------------------------------------------------------
# source('/home/rstudio/code/R_analysis/R_functions/qqq/generate_ml_training_data.R')
# source('/home/rstudio/code/R_analysis/R_functions/qqq/generate_Xy_data_peaks_final.R')
# source('/home/rstudio/code/R_analysis/R_functions/para_opt/nn_functions/model_sensitivity_by_feature.R')
# source('/home/rstudio/code/R_analysis/R_functions/qqq/qqq_auto_integrate.R')
#
# # Load global settings -----------------------------------------------------------------------------------
# source('/home/rstudio/code/R_analysis/R_functions/qqq/initialize_prm.R')
# prm <- initialize_prm()
# # Check if model_type settings match prm settings --------------------------------------------------------
# if( ((model_type == 'pcand') &! (prm$ml_type =="mltrain_initial")) |
# ((model_type == 'finalp') &! (prm$ml_type =="mltrain_pcand")) ){
# cat('\nInconsistent model type settings. Please check initialize_prm.R')
# cat(paste0('\n\t'),"Choosen model type: ",model_type)
# cat(paste0('\n\t'),"prm model type: ",prm$ml_type,'\n')
# stop()
# }
train_model <- function(model_type,base_folder,data_sets,model_file_name){
prm <- list()
if (model_type == 'pcand') {
prm$ml_type <- "mltrain_initial"
} else if (model_type == 'finalp') {
if (is.null(prm$model_rf_path) || !file.exists(prm$model_rf_path)) {
cat('\n RData file of the first model is missing. Please define the path to the model in update_prm.xlsx')
stop()
} else {
prm$ml_type <- "mltrain_pcand"
}
# # Check if R.data file of 1. model is present -----------------------------------------------------------
# if((model_type == 'finalp') & !file.exists(prm$model_rf_path)){
# cat('\n R.data-file of 1. model is missing. Please check initialize_prm.R')
# }
}
# Training for consensus peak detection -----------------------------------------------------------------
if(model_type=='pcand'){
if (model_type=='pcand') {
for(data in 1:length(data_sets)){
for (data in 1:length(data_sets)) {
cat(paste0("\nCurrent dataset: ",data_sets[data]))
......@@ -60,7 +42,7 @@ qqq_model_train <- function(model_type,base_folder,data_sets,model_file_name){
cat(paste0('cannot find stored initial QS log in ', base_folder, data_sets[data]))
cat('Processing this data set...')
nowfolder <- paste0(base_folder, data_sets[data])
qqq_auto_integrate(nowfolder, parameters = prm)
process_batch(nowfolder, parameters = prm)
}
# Generate training data
training_data <- generate_ml_training_data(tsv_path, xlsx_path)
......@@ -98,7 +80,7 @@ qqq_model_train <- function(model_type,base_folder,data_sets,model_file_name){
print(paste0('cannot find stored initial QS log in ', base_folder, data_sets[data] ))
print('Processing this data set...')
nowfolder <- paste0(base_folder, data_sets[data])
qqq_auto_integrate(nowfolder, parameters = prm)
process_batch(nowfolder, parameters = prm)
}
# Generate training data
if (data==1) {
......
......@@ -119,7 +119,7 @@ write_peakinfo <- function(metab, smpl, msd, prm){
openxlsx::writeData(wb,'info', read.csv('R_messages.log', stringsAsFactors = FALSE,row.names=NULL), startRow = 15, rowNames=FALSE, colNames = FALSE)
}
openxlsx::saveWorkbook(wb, "Peakinfo.xlsx", overwrite = TRUE)
openxlsx::saveWorkbook(wb, file.path(prm$batchdir, "Peakinfo.xlsx"), overwrite = TRUE)
cat('...done.\n\n', file=prm$log_con)
system("chmod 777 Peakinfo.xlsx", intern=FALSE, wait=FALSE)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/qqq_auto_integrate.R
\name{qqq_auto_integrate}
\alias{qqq_auto_integrate}
% Please edit documentation in R/process_batch.R
\name{process_batch}
\alias{process_batch}
\title{process set of .mzML files}
\usage{
qqq_auto_integrate(nowfolder = "", parameters = list())
process_batch(nowfolder = "", parameters = list())
}
\arguments{
\item{nowfolder}{folder that contains .mzML files, sample.info and optionally update_prm.xlsx. Default is the current working directory}
\item{parameters}{list of parameters that overwrites defaults defined in initialize_prm()}
}
\value{
msd, a list of processed data for all metabolites and samples
}
\description{
This function processes all .mzML files in nowfolder (current folder if empty)
using metabolite definitions and sample information provided in the same folder.
Output files are written to nowfolder.
}
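% A minimal usage sketch added for illustration; the folder path and the verbose override are hypothetical:
\examples{
\dontrun{
msd <- process_batch(nowfolder = "/path/to/batch_folder",
                     parameters = list(verbose = 1))
}
}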
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/qqq_model_train.R
\name{qqq_model_train}
\alias{qqq_model_train}
% Please edit documentation in R/train_model.R
\name{train_model}
\alias{train_model}
\title{training of random forest models}
\usage{
qqq_model_train(model_type, base_folder, data_sets, model_file_name)
train_model(model_type, base_folder, data_sets, model_file_name)
}
\arguments{
\item{model_type}{pcand or finalp to train peak recognition model or peak output model}
......
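% A minimal usage sketch added for illustration; paths, dataset names and the model file name are hypothetical:
\examples{
\dontrun{
train_model(model_type = "pcand",
            base_folder = "/path/to/training_batches/",
            data_sets = c("batch_01", "batch_02"),
            model_file_name = "model_pcand.RData")
}
}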