Commit e51e8320 authored by Joerg Buescher's avatar Joerg Buescher
Browse files

renaming main functions and fixing generate_Xy_data_peaks_final.R to read...

renaming main functions and fixing generate_Xy_data_peaks_final.R to read manual_peakcheck based on template
parent 24f67a24
No preview for this file type
#' Build an Xy training table for peak-quality classification
#'
#' Combines a manual peak-classification workbook with a tab-separated file of
#' per-peak quality scores into one data frame suitable for model training.
#'
#' NOTE(review): the visible source was a raw diff with old and new lines
#' interleaved (duplicated loop headers and closing braces). This body is the
#' post-commit version: iterate over unique metabolite names, locate each
#' metabolite's column once, and guard the sample-name extraction.
#'
#' @param xlsx_path path to the classification workbook (template layout:
#'   metabolite names in row 1, sample names in column 1, class labels in the
#'   body; labels start in row 3).
#' @param tsv_path path to the quality-score table; its first data row holds
#'   the column names, column 1 the metabolite and column 2 the sample.
#' @return data frame with row names "<metabolite> <sample>"; column 1 is the
#'   factor y (manual class), followed by the QS_* scores and five RF quantile
#'   columns (RF0, RF25, RF50, RF75, RF100).
generate_Xy_data_peaks_final <- function(xlsx_path, tsv_path) {
  # xlsx with manual classification
  xlsx_td <- openxlsx::read.xlsx(xlsx_path, colNames = FALSE, rowNames = FALSE)
  # tsv with quality scores; promote its first row to column names
  tsv_td <- read.table(file = tsv_path, sep = '\t', stringsAsFactors = FALSE)
  colnames(tsv_td) <- as.character(tsv_td[1, ])
  tsv_td <- as.data.frame(tsv_td[-1, ])
  tsvqscols <- grep('QS_', colnames(tsv_td))
  pcandcol <- which(colnames(tsv_td) == 'Output_H')
  # coerce quality-score and candidate-probability columns to numeric
  for (ic in c(tsvqscols, pcandcol)) {
    tsv_td[, ic] <- as.numeric(tsv_td[, ic])
  }
  # initialize Xy_data: one row per metabolite*sample;
  # columns = y + QS columns + 5 RF quantile columns
  my_data <- as.data.frame(matrix(NA,
                                  nrow = (ncol(xlsx_td) - 2) * (nrow(xlsx_td) - 3),
                                  ncol = ncol(tsv_td) + 4))
  colnames(my_data) <- c("y", colnames(tsv_td)[-c(1, 2)], paste0("RF", seq(0, 100, 25)))
  metnames <- unique(as.character(xlsx_td[1, ]))
  metnames <- metnames[!is.na(metnames)]
  # parse y (manual classification) from xlsx data
  count <- 1
  for (im in seq_along(metnames)) {
    # first column carrying this metabolite's labels
    col <- which(xlsx_td[1, ] == metnames[im])[1]
    sample <- NA  # retains last valid name if a cell is too short to trim
    for (row in 3:nrow(xlsx_td)) {
      if (nchar(xlsx_td[row, 1]) > 6) {
        # drop the 5-character suffix from the raw sample name
        sample <- substr(xlsx_td[row, 1], 1, nchar(xlsx_td[row, 1]) - 5)
      }
      rownames(my_data)[count] <- paste0(metnames[im], " ", sample)
      my_data[count, 1] <- xlsx_td[row, col]
      count <- count + 1
    }
  }
  # parse quality scores from tsv file
  for (row in seq_len(nrow(tsv_td))) {
    # tsv column 2 = metabolite, column 1 = sample -> "<metabolite> <sample>"
    sample_metab <- paste0(tsv_td[row, 2], " ", tsv_td[row, 1])
    # locate the matching Xy row and scrape the quality scores
    row_index <- which(rownames(my_data) == sample_metab)
    qs_scores <- tsv_td[row, tsvqscols]
    # aggregate RF hit probability quantiles over all rows of this metabolite
    metab_indices <- which(tsv_td[, 2] == tsv_td[row, 2])
    rf_quantiles <- quantile(tsv_td[metab_indices, pcandcol])
    my_data[row_index, 2:ncol(my_data)] <- c(qs_scores, rf_quantiles)
  }
  # Only use subset where peak classification is 2 or 0 (1's are maybe peaks)
  #my_data <- my_data[my_data[,1] != 1,]
  #my_data[my_data[,1] == 1,1] <- 0
  # Set y to factor
  my_data$y <- as.factor(my_data$y)
  # Return Xy_data
  my_data
}
......@@ -6,8 +6,10 @@
#' @param nowfolder folder that contains .mzML files, sample.info and optionally update_prm.R. Default is current working directory
#' @param parameters list of parameters that overwrites defaults defined in initialize_prm()
#'
#' @return msd list metabolites of samples with processed data
#'
#' @export
qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
process_batch <- function(nowfolder = "", parameters = list()) {
# Initialize analysis ------------------------------
ptm <- proc.time()
......@@ -21,7 +23,7 @@ qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
nowfolder <- getwd()
}else{
if(dir.exists(nowfolder)){
print('setting nowfolder2')
print('setting nowfolder')
setwd(nowfolder)
}else{
cat(paste0(nowfolder," directory doesn't exist!"))
......@@ -35,7 +37,6 @@ qqq_auto_integrate <- function(nowfolder = "", parameters = list()) {
# Initialize prm (global settings)
prm <- initialize_prm()
print('initialize complete')
# update prm from local settings in mzML folder
update_prm_path <- paste0(nowfolder,"/update_prm.xlsx")
......@@ -43,8 +44,16 @@ print('initialize complete')
updatedata <- openxlsx::read.xlsx(update_prm_path)
if (('Variable.in.prm' %in% colnames(updatedata)) && ('Value' %in% colnames(updatedata)) ) {
for (iv in 1:nrow(updatedata)) {
prm[[updatedata[iv,'Variable.in.prm']]] <- updatedata[iv,'Value']
nowvalue <- updatedata[iv,'Value']
if (nowvalue %in% c("TRUE", "FALSE")) {
nowvalue <- as.logical(nowvalue)
} else if ( !is.na(as.numeric(nowvalue)) ) {
nowvalue <- as.numeric(nowvalue)
}
prm[[updatedata[iv,'Variable.in.prm']]] <- nowvalue
}
} else {
print('Error in update_prm.xlsx, cannot find required columns "Variabel in prm" and "Value".')
}
}
......@@ -55,42 +64,37 @@ print('initialize complete')
prm[[paranames[ip]]] <- parameters[[paranames[ip]]]
}
}
print('prm update complete')
if(!prm$runninglocal){ # if running on server
sink(prm$log_con, append=TRUE) # output errors and messages to log file
}
# Log git info ------------------------------
tryCatch({
cat(paste(timestamp(), '\n Now running process_batch.R',
'\n\n Current user: ', Sys.info()[["user"]], '\n \n'), file=prm$log_con)
scriptgithash <- automrmgithash <- helperfilesgithash <- ''
if (dir.exists('~/data/helperfiles')) {
setwd('~/data/helperfiles')
helperfilesgithash <- system("git describe --always", intern=TRUE)
helperfilesgithash <- system(paste0("git --git-dir=", prm$pathprefix, "data/helperfiles/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for helperfiles: https://gitlab.gwdg.de/joerg.buescher/helperfiles
Commit hash: ', helperfilesgithash), file=prm$log_con)
}
if (dir.exists('~/data/automRm')) {
setwd('~/data/automRm')
automrmgithash <- system("git describe --always", intern=TRUE)
automrmgithash <- system(paste0("git --git-dir=", prm$pathprefix, "data/automRm/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for automRm: https://gitlab.gwdg.de/joerg.buescher/automrm
Commit hash: ', automrmgithash,
), file=prm$log_con)
}
if (dir.exists('~/code/R_analysis')) {
setwd('~/code/R_analysis')
scriptgithash <- system("git describe --always", intern=TRUE)
scriptgithash <- system(paste0("git --git-dir=", prm$pathprefix, "code/R_analysis/.git/ describe --always"), intern=TRUE)
cat(paste('\n\n Gitlab repository for processing script: https://gitlab.gwdg.de/daniel.eilertz/ms-web
Commit hash: ', scriptgithash), file=prm$log_con)
}
cat(paste(timestamp(), '\n Now running QQQ_integrate.R
\n Gitlab repository for processing script: https://gitlab.gwdg.de/daniel.eilertz/ms-web
Commit hash: ', scriptgithash,
'\n\n Gitlab repository for helperfiles: https://gitlab.gwdg.de/joerg.buescher/helperfiles
Commit hash: ', helperfilesgithash,
'\n\n Gitlab repository for automRm: https://gitlab.gwdg.de/joerg.buescher/automrm
Commit hash: ', automrmgithash,
'\n\n Current user: ', Sys.info()[["user"]], '\n \n'), file=prm$log_con)
rm('helperfilesgithash', 'scriptgithash', 'automrmgithash')
}, error = function(err) {
# not in docker container used for development, do nothing
})
print('passed git check')
# Reset working directory
setwd(nowfolder)
# load random forest models and add to parameter list
if (prm$ml_type %in% c('mlprod', 'mltrain_pcand') ) { #(!(prm$ml_train && (prm$ml_type == 'initial'))) {
......@@ -206,6 +210,8 @@ print('passed git check')
# if tidying up doesn't work, well too bad.
})
msd
} # endfunction
......
......@@ -8,43 +8,25 @@
#' @param model_file_name name of model to be written do disc
#'
#' @export
qqq_model_train <- function(model_type,base_folder,data_sets,model_file_name){
# # load function to prepare training data -----------------------------------------------------------------
# source('/home/rstudio/code/R_analysis/R_functions/qqq/generate_ml_training_data.R')
# source('/home/rstudio/code/R_analysis/R_functions/qqq/generate_Xy_data_peaks_final.R')
# source('/home/rstudio/code/R_analysis/R_functions/para_opt/nn_functions/model_sensitivity_by_feature.R')
# source('/home/rstudio/code/R_analysis/R_functions/qqq/qqq_auto_integrate.R')
#
# # Load global settings -----------------------------------------------------------------------------------
# source('/home/rstudio/code/R_analysis/R_functions/qqq/initialize_prm.R')
# prm <- initialize_prm()
# # Check if model_type settings match prm settings --------------------------------------------------------
# if( ((model_type == 'pcand') &! (prm$ml_type =="mltrain_initial")) |
# ((model_type == 'finalp') &! (prm$ml_type =="mltrain_pcand")) ){
# cat('\nInconsistent model type settings. Please check initialize_prm.R')
# cat(paste0('\n\t'),"Choosen model type: ",model_type)
# cat(paste0('\n\t'),"prm model type: ",prm$ml_type,'\n')
# stop()
# }
train_model <- function(model_type,base_folder,data_sets,model_file_name){
prm <- list()
if (model_type == 'pcand') {
prm$ml_type <- "mltrain_initial"
} else if (model_type == 'finalp') {
prm$ml_type <- "mltrain_pcand"
if (!file.exists(prm$model_rf_path)) {
cat('\n R.data-file of 1. model is missing. Please define path to model in update_prm.xlsx')
stop()
} else {
prm$ml_type <- "mltrain_pcand"
}
}
# # Check if R.data file of 1. model is present -----------------------------------------------------------
# if((model_type == 'finalp') & !file.exists(prm$model_rf_path)){
# cat('\n R.data-file of 1. model is missing. Please check initialize_prm.R')
# }
# Training for consensus peak detection -----------------------------------------------------------------
if(model_type=='pcand'){
if (model_type=='pcand') {
for(data in 1:length(data_sets)){
for (data in 1:length(data_sets)) {
cat(paste0("\nCurrent dataset: ",data_sets[data]))
......@@ -60,7 +42,7 @@ qqq_model_train <- function(model_type,base_folder,data_sets,model_file_name){
cat(paste0('cannot find stored initial QS log in ', base_folder, data_sets[data]))
cat('Processing this data set...')
nowfolder <- paste0(base_folder, data_sets[data])
qqq_auto_integrate(nowfolder, parameters = prm)
process_batch(nowfolder, parameters = prm)
}
# Generate training data
training_data <- generate_ml_training_data(tsv_path, xlsx_path)
......@@ -98,7 +80,7 @@ qqq_model_train <- function(model_type,base_folder,data_sets,model_file_name){
print(paste0('cannot find stored initial QS log in ', base_folder, data_sets[data] ))
print('Processing this data set...')
nowfolder <- paste0(base_folder, data_sets[data])
qqq_auto_integrate(nowfolder, parameters = prm)
process_batch(nowfolder, parameters = prm)
}
# Generate training data
if (data==1) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment