Dear Gitlab users, due to maintenance reasons, Gitlab will not be available on Thursday 30.09.2021 from 5:00 pm to approximately 5:30 pm.

Commit 653982fb authored by joerg.buescher's avatar joerg.buescher
Browse files

enabling additional ML options: neural net and xgboost

parent 49038d31
......@@ -23,8 +23,10 @@ Imports:
e1071 (>= 1.7-3),
mgcv(>= 1.8-28),
mzR (>= 2.20.0),
neuralnet (>= 1.44.2),
nnls (>= 1.4),
openxlsx (>= 4.1.4),
randomForest (>= 4.6-14),
robustbase(>= 0.93-6),
robustbase (>= 0.93-6),
xgboost (>= 0.82.1),
zoo (>= 1.8-7)
......@@ -229,7 +229,7 @@ for (im in 1:prm$nmet) {
# Transform input data
trans <- predict(prm$train_preprocessing, as.data.frame(qsmat) )
# select best candidate by RF model
pred_qs <- predict(prm$model_rf, trans, type="prob")
pred_qs <- predict(prm$model, trans, type="prob")
bestcand <- which(pred_qs$H == max(pred_qs$H ))[1] # take first candidate if two bestcands are present
}
......@@ -287,7 +287,7 @@ for (im in 1:prm$nmet) {
final_peak_class$H <- mean(as.numeric(nowqs))
}else{
trans <- predict(prm$train_preprocessing, as.data.frame(nowqs) )
final_peak_class <- predict(prm$model_rf, trans,type="prob")
final_peak_class <- predict(prm$model, trans,type="prob")
}
# log quality scores for ML training
......@@ -329,7 +329,7 @@ for (im in 1:prm$nmet) {
msd[[im]][[id]]$label2 <- paste0(label2,
'\n',
' rf_pcand:' , final_peak_class$H)
' ML_pcand:' , round(final_peak_class$H,2))
} # endfor all samples
} # endif metabdb_data$quantifier was measured in refsample
......
evaluate_peaks_rf <- function(msd, prm){
evaluate_peaks <- function(msd, prm){
# Function to evaluate peaks based on final machine-learning model
# In training mode: all peak candidates are reported
......@@ -18,8 +18,8 @@ evaluate_peaks_rf <- function(msd, prm){
for (im in 1:prm$nmet){
# calculate quantiles for each batch
rf_quantile <- quantile(sapply(msd[[im]], '[[', 'qs')) # quantile(rf_vec)
names(rf_quantile) <- paste0("RF",seq(0,100,25))
ml_quantile <- quantile(sapply(msd[[im]], '[[', 'qs')) # quantile(rf_vec)
names(ml_quantile) <- paste0("RF",seq(0,100,25))
for (id in 1:prm$nsmpl){
......@@ -33,21 +33,21 @@ evaluate_peaks_rf <- function(msd, prm){
cat('\r', paste('evaluate_peaks: metab', im, 'sample', id, ' '))
}
nowrfquantile <- c(data.frame('Output_H' = msd[[im]][[id]]$qs), rf_quantile)
nowquantile <- c(data.frame('Output_H' = msd[[im]][[id]]$qs), ml_quantile)
# msd[[im]][[id]]$rf_quantile <- rf_quantile # do we really ever need this again? -> Good question! -> if yes, it should be saved to metab
# msd[[im]][[id]]$ml_quantile <- ml_quantile # do we really ever need this again? -> Good question! -> if yes, it should be saved to metab
# predict final peak score
final_data <- as.data.frame(c(msd[[im]][[id]]$qs_list, nowrfquantile)) # as.data.frame(t(c(peaks_by_metab_temp[[im]][[id]][["peaks"]][[peak]]$qs_list,rf_quantile)))
final_data <- as.data.frame(c(msd[[im]][[id]]$qs_list, nowquantile)) # as.data.frame(t(c(peaks_by_metab_temp[[im]][[id]][["peaks"]][[peak]]$qs_list,ml_quantile)))
final_data_prep <- predict(prm$train_preprocessing_final,final_data)
final_pred <- predict(prm$model_rf_final,final_data_prep,type="prob")$H
final_pred <- predict(prm$model_final,final_data_prep,type="prob")$H
#update plotdata label1
nowstartpos <- gregexpr('\n', msd[[im]][[id]]$label1)[[1]][2]
msd[[im]][[id]]$label1 <- paste0(substr(msd[[im]][[id]]$label1, 1, nowstartpos), 'QS: ', round(100*final_pred,0) )
#update plotdata label2
quantile_label <- paste0(names(rf_quantile),": ",rf_quantile,collapse=" ")
quantile_label <- paste0(names(ml_quantile),": ",ml_quantile,collapse=" ")
new_label <- paste0(paste("rep: ",final_pred),"\n",quantile_label)
msd[[im]][[id]]$label2 <- paste(msd[[im]][[id]]$label2, new_label)
......
......@@ -48,9 +48,9 @@ generate_Xy_data_peaks_final <- function(xlsx_path,tsv_path){
# get indices of current metab to aggregate model hit probability
metab_indices <- which(tsv_td[,2] == tsv_td[row,2])
rf_quantiles <- quantile(tsv_td[metab_indices, pcandcol])
ml_quantile <- quantile(tsv_td[metab_indices, pcandcol])
my_data[row_index,2:ncol(my_data)] <- c(qs_scores, rf_quantiles)
my_data[row_index,2:ncol(my_data)] <- c(qs_scores, ml_quantile)
}
# Only use subset of mamatrix where peak classification is 2 or 0 (1's are maybe peaks)
......
......@@ -76,7 +76,7 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
if(prm$ml_type == "mltrain_initial"){
cor_value <- 0
}else{
cor_value <- prm$rf_median_values$QS_cor12
cor_value <- prm$median_values$QS_cor12
}
}else{
cor_value <- cor(x1[peakstart:peakend],x2[peakstart:peakend],use="everything") * 0.5 + 0.5
......@@ -89,7 +89,8 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
if(prm$ml_type == "mltrain_initial"){
cor_value <- 0
}else{
cor_value <- prm$rf_median_values$QS_cor13
cor_value <- prm$
median_values$QS_cor13
}
}else{
cor_value <- cor(x1[peakstart:peakend],x3[peakstart:peakend],use="everything") * 0.5 + 0.5
......@@ -101,7 +102,7 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
if(prm$ml_type == "mltrain_initial"){
cor_value <- 0
}else{
cor_value <- prm$rf_median_values$QS_cor23
cor_value <- prm$median_values$QS_cor23
}
}else{
cor_value <- cor(x2[peakstart:peakend],x3[peakstart:peakend],use="everything") * 0.5 + 0.5
......@@ -124,7 +125,7 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
if(prm$ml_type == "mltrain_initial"){
qual_t_o_value <- 0
}else{
qual_t_o_value <- prm$rf_median_values$QS_T.o2
qual_t_o_value <- prm$median_values$QS_T.o2
}
}else{
qual_t_o_value <- max((max(x2[peakstart:peakend],na.rm=TRUE) / ( (max(x2[peakstart:peakend],na.rm=TRUE) + max(c(x2[1:max(c(peakstart-1),1)],x2[min(c(peakend+1,length(x2))):length(x2)]),na.rm=TRUE)) )),0)
......@@ -145,7 +146,7 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
if(prm$ml_type == "mltrain_initial"){
ratio_value <- 0
}else{
ratio_value <- prm$rf_median_values$QS_1vs2
ratio_value <- prm$median_values$QS_1vs2
}
} else {
ratio_value <- (1 / (1 + abs(log(x1[peaktop] / x2[peaktop], 2) - log(metab[[im]]$quant$intens / metab[[im]]$qual12c$intens,2))^2 ) )^2
......@@ -157,7 +158,7 @@ get_quality_scores <- function(x1, x2, x3, peakstart, peaktop, peakend, nowusetr
if(prm$ml_type == "mltrain_initial"){
qual_t_o_value <- 0
}else{
qual_t_o_value <- prm$rf_median_values$QS_T.o3
qual_t_o_value <- prm$median_values$QS_T.o3
}
}else{
qual_t_o_value <- max((max(x3[peakstart:peakend],na.rm=TRUE) / ( (max(x3[peakstart:peakend],na.rm=TRUE) + max(c(x3[1:max(c(peakstart-1),1)],x3[min(c(peakend+1,length(x3))):length(x3)]),na.rm=TRUE)) )),0)
......
......@@ -58,8 +58,8 @@ initialize_prm <- function() {
# Paths and columns
# prm$RTcolumn <- 'LunaNH2.RT.QQQ'
# prm$excelfile <- paste0(prm$pathprefix, 'data/helperfiles/metabolite_databases/AgilentQQQdb.xlsx') # polar is default
prm$model_rf_path <- paste0(prm$pathprefix, 'data/helperfiles/testdata/LC_machine_learning/rf_model_pcand.Rdata')
prm$model_rf_final_path <- paste0(prm$pathprefix, 'data/helperfiles/testdata/LC_machine_learning/rf_model_finalp.Rdata')
prm$model_path <- paste0(prm$pathprefix, 'data/helperfiles/testdata/LC_machine_learning/model_pcand.Rdata')
prm$model_final_path <- paste0(prm$pathprefix, 'data/helperfiles/testdata/LC_machine_learning/model_finalp.Rdata')
#?
prm$sum12c <- FALSE # sum 12c qual and quant area/height #
......
......@@ -11,6 +11,7 @@
#'
#' @export
process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE) {
# process_batch('~/data/helperfiles/testdata/LC_machine_learning/ccm2/' )
# Initialize analysis ------------------------------
ptm <- proc.time()
......@@ -76,27 +77,27 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
# load random forest models and add to parameter list
if (prm$ml_type %in% c('mlprod', 'mltrain_pcand') ) { #(!(prm$ml_train && (prm$ml_type == 'initial'))) {
if (file.exists(prm$model_rf_path)) {
load(file = prm$model_rf_path) # 1. model to detect best peak in MRM
prm$model_rf <- model_rf_pcand
if (file.exists(prm$model_path)) {
load(file = prm$model_path) # 1. model to detect best peak in MRM
prm$model <- model_pcand
prm$train_preprocessing <- train_preprocessing_pcand
prm$rf_median_values <- model_rf_median_values
prm$median_values <- model_median_values
# only keep what we need
rm('model_rf_pcand', 'model_rf_median_values', 'train_preprocessing_pcand', 'train_dummy_vars_pcand')
rm('model_pcand', 'model_median_values', 'train_preprocessing_pcand', 'train_dummy_vars_pcand')
} else {
cat(paste0(prm$model_rf_path,": model doesn't exist!"))
cat(paste0(prm$model_path,": model doesn't exist!"))
return(NULL)
}
}
if (prm$ml_type == 'mlprod') { #} (!prm$ml_train) {
if (file.exists(prm$model_rf_final_path)) {
load(file = prm$model_rf_final_path)# 2. model to evaluate if peak should be reported
prm$model_rf_final <- model_rf_finalp
if (file.exists(prm$model_final_path)) {
load(file = prm$model_final_path)# 2. model to evaluate if peak should be reported
prm$model_final <- model_finalp
prm$train_preprocessing_final <- train_preprocessing_finalp
# only keep what we need
rm('model_rf_finalp', 'train_preprocessing_finalp', 'train_dummy_vars_finalp')
rm('model_finalp', 'train_preprocessing_finalp', 'train_dummy_vars_finalp')
} else {
cat(paste0(prm$model_rf_path,": model doesn't exist!"))
cat(paste0(prm$model_path,": model doesn't exist!"))
return(NULL)
}
}
......@@ -158,7 +159,7 @@ process_batch <- function(nowfolder = "", parameters = list(), return_msd=FALSE)
msd <- normalize13c(msd, prm)
# Peak evaluation ------------------------------
msd <- evaluate_peaks_rf(msd, prm)
msd <- evaluate_peaks(msd, prm)
# Plot peak data ------------------------------
plot_peaks(metab, smpl, msd, prm)
......
......@@ -183,7 +183,7 @@ qqq_mrm_integrate <- function(metab, smpl, prm, msd, id, im) {
# Transform input data
trans <- predict(prm$train_preprocessing, as.data.frame(nowqs) ) # as.data.frame(t(nowqs))
# RF Model predictions
qs[i] <- predict(prm$model_rf, trans,type="prob")$H
qs[i] <- predict(prm$model, trans,type="prob")$H
}
# add nowqs to peak parameters
......
......@@ -3,12 +3,14 @@
#' This function trains the random forest models used for peak classification
#'
#' @param model_type pcand or finalp to train peak recognition model or peak output model
#' @param ml_type type of machine learning algorithm to be used. Options are rf or nnet or xgbtree
#' @param base_folder system folder that contains training data in subfolders
#' @param data_sets vector of names of folders in base_folder that contain training data and training solutions
#' @param model_file_name name of model to be written do disc
#'
#' @export
train_model <- function(model_type,base_folder,data_sets,model_file_name){
train_model <- function(model_type, ml_type = 'rf', base_folder = '.', data_sets, model_file_name){
# train_model('finalp', 'rf', '~/data/helperfiles/testdata/LC_machine_learning/', 'ccm2', 'testmodel' )
prm <- list()
if (model_type == 'pcand') {
......@@ -150,32 +152,55 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
verboseIter=TRUE # training log
)
model_rf = caret::train(y ~ .,
data=train_data,
method='rf',
tuneLength=10,
metric="ROC",
ntree=500,
nodesize = 5, # 1 as an alternative
trControl = fitControl)
if (ml_type == 'rf') {
model = caret::train(y ~ .,
data=train_data,
method='rf', # 'xgbTree', 'rf', 'nnet'
tuneLength=10,
metric="ROC",
ntree=500,
nodesize = 5, # 1 as an alternative
trControl = fitControl)
} else if (ml_type == 'nnet') {
model = caret::train(y ~ .,
data=train_data,
method='nnet',
tuneLength=10,
metric="ROC",
ntree=500,
nodesize = 5, # 1 as an alternative
trControl = fitControl)
} else if (ml_type == 'xgbtree') {
model = caret::train(y ~ .,
data=train_data,
method='xgbTree',
tuneLength=10,
metric="ROC",
trControl = fitControl)
} else {
print(paste0('Unknown ML type: ', ml_type))
stop()
}
# Assign random forest model (model,preprocessing,...) ---------------------------------------------
assign(paste0("model_rf","_",model_type), model_rf)
assign(paste0("model","_",model_type), model)
assign(paste0("train_preprocessing","_",model_type), train_preprocessing)
assign(paste0("train_dummy_vars","_",model_type), train_dummy_vars)
assign(paste0("training_data","_",model_type), train_data)
model_objects <- c(paste0("model_rf","_",model_type),paste0("train_preprocessing","_",model_type),paste0("train_dummy_vars","_",model_type))
model_objects <- c(paste0("model","_",model_type),paste0("train_preprocessing","_",model_type),paste0("train_dummy_vars","_",model_type))
# Store median values of quality scores with qualifier information (--> uses as default values )
if(model_type == "pcand"){
qsnames <- names(x1_train_data)
model_rf_median_values <- list()
model_median_values <- list()
for (iq in 1:length(qsnames)) {
model_rf_median_values[[qsnames[iq]]] <- median( x1_train_data[[qsnames[iq] ]][x12_13c_stack[,1][trainRowNumbers1]==TRUE])
model_median_values[[qsnames[iq]]] <- median( x1_train_data[[qsnames[iq] ]][x12_13c_stack[,1][trainRowNumbers1]==TRUE])
}
model_objects <- c(model_objects,"model_rf_median_values")
model_objects <- c(model_objects,"model_median_values")
}
# Save random forest data to file ---------------------------------------------------------------------
model_file_name_prefix <- paste0(model_file_name,"_",model_type)
......@@ -183,7 +208,7 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
# Predict on testData and Compute the confusion matrix ------------------------------------------------
predicted_rf <- predict(model_rf, test_data)
predicted <- predict(model, test_data)
# Plot feature density (based on training data) -------------------------------------------------------
pdf(file = paste0(base_folder,paste0("feature_density_",model_type,".pdf")), height = 6, width = 6, family = "Helvetica")
......@@ -199,7 +224,7 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
# Plot feature density (based on predicted testing data) -----------------------------------------------
pdf(file = paste0(base_folder,paste0("feature_density_model_",model_file_name_prefix,".pdf")), height = 6, width = 6, family = "Helvetica")
fplot <- featurePlot(x = test_data[,1:ncol(test_data)-1],
y = predicted_rf,
y = predicted,
plot = "density",
strip=strip.custom(par.strip.text=list(cex=.7)),
scales = list(x = list(relation="free"),
......@@ -209,29 +234,29 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
# Plot false positives / false negatives -----------------------------------------------------------
false_p <- test_data[which(cbind(predicted_rf,test_data$y)[,1]==2 & cbind(predicted_rf,test_data$y)[,2]==1),]
false_p <- test_data[which(cbind(predicted,test_data$y)[,1]==2 & cbind(predicted,test_data$y)[,2]==1),]
sink(paste0(base_folder,paste0("false_positives_",model_file_name_prefix,".txt")))
print(false_p)
sink()
false_n <- test_data[which(cbind(predicted_rf,test_data$y)[,1]==1 & cbind(predicted_rf,test_data$y)[,2]==2),]
false_n <- test_data[which(cbind(predicted,test_data$y)[,1]==1 & cbind(predicted,test_data$y)[,2]==2),]
sink(paste0(base_folder,paste0("false_negatives_",model_file_name_prefix,".txt")))
print(false_n)
sink()
# Plot model prediction densities (H/F) -----------------------------------------------------------
pdf(file = paste0(base_folder,paste0("prob_rf_",model_file_name_prefix,".pdf")), height = 6, width = 6, family = "Helvetica")
dat <- data.frame(prob=predict(model_rf, test_data,type = "prob")[,2],class=test_data$y)
pdf(file = paste0(base_folder,paste0("prob_",model_file_name_prefix,".pdf")), height = 6, width = 6, family = "Helvetica")
dat <- data.frame(prob=predict(model, test_data,type = "prob")[,2],class=test_data$y)
ggplot(dat, aes(x = prob, fill = class)) + geom_density(alpha = 0.5) + ggtitle("Random Forest")# + ylim(0, 1)
dev.off()
# Plot confusion matrix ----------------------------------------------------------------------------
conf_m <- print(caret::confusionMatrix(reference = test_data$y, data = predicted_rf, mode='everything', positive='H'))
conf_m <- print(caret::confusionMatrix(reference = test_data$y, data = predicted, mode='everything', positive='H'))
sink(paste0(base_folder,paste0("confusion_matrix_",model_file_name_prefix,".txt")))
print(conf_m)
sink()
# Plot predictons based on variable thresholds (H/F) -----------------------------------------------
model_posi <- predict(model_rf, test_data,type = "prob")[,2] # model's possibilities
model_posi <- predict(model, test_data,type = "prob")[,2] # model's possibilities
pred <- factor(rep("F",length(model_posi)),levels=c("F","H"))
thresh <- 0.9
pred[model_posi > thresh] <- factor("H")
......@@ -246,12 +271,12 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
for(thresh in iterations){
pred <- factor(rep("F",length(model_posi)),levels=c("F","H"))
pred[model_posi > thresh] <- factor("H")
cm_rf <- confusionMatrix(reference = truth, data = pred, mode='everything', positive='H')
cm <- confusionMatrix(reference = truth, data = pred, mode='everything', positive='H')
cm_dat[it,1] <- thresh
cm_dat[it,2] <- cm_rf$table[2,1]
cm_dat[it,3] <- cm_rf$table[2,2]
cm_dat[it,4] <- cm_rf$byClass["F1"]
cm_dat[it,5] <- cm_rf$overall["Kappa"]
cm_dat[it,2] <- cm$table[2,1]
cm_dat[it,3] <- cm$table[2,2]
cm_dat[it,4] <- cm$byClass["F1"]
cm_dat[it,5] <- cm$overall["Kappa"]
it <- it + 1
}
pdf(file = paste0(base_folder,paste0("manual_treshold_",model_file_name_prefix,".pdf")), height = 6, width = 6, family = "Helvetica")
......@@ -262,10 +287,10 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
# Plot feature importance (Quality scores ~ model) --------------------------------------------------
varimp_rf <- varImp(model_rf)
varimp_rf$importance <- varimp_rf$importance
varimp <- varImp(model)
varimp$importance <- varimp$importance
pdf(file = paste0(base_folder,paste0("feature_importance_",model_file_name_prefix,".pdf")), height = 6, width = 6, family = "Helvetica")
fiplot <- plot(varimp_rf,main="RF")
fiplot <- plot(varimp,main="RF")
print(fiplot)
dev.off()
......@@ -273,8 +298,13 @@ train_model <- function(model_type,base_folder,data_sets,model_file_name){
# after initial training prepare manual_peakcheck_template.xlsx and plotoverview.pdf
# to enable second round of training
if (model_type=='pcand') {
prm <- list()
prm$ml_type <- "mltrain_pcand"
prm$model_rf_path <- paste0(base_folder, model_file_name_prefix, ".Rdata")
print(base_folder)
print(model_file_name_prefix)
print(model_file_name)
prm$model_path <- paste0(base_folder, model_file_name_prefix, ".Rdata")
for (data in 1:length(data_sets)) {
cat(paste0("\nCurrent dataset: ",data_sets[data]))
nowfolder <- paste0(base_folder, data_sets[data])
......
......@@ -4,11 +4,19 @@
\alias{train_model}
\title{training of machine-learning models}
\usage{
train_model(model_type, base_folder, data_sets, model_file_name)
train_model(
model_type,
ml_type = "rf",
base_folder = ".",
data_sets,
model_file_name
)
}
\arguments{
\item{model_type}{pcand or finalp to train peak recognition model or peak output model}
\item{ml_type}{type of machine learning algorithm to be used. Options are rf or nnet or xgbtree}
\item{base_folder}{system folder that contains training data in subfolders}
\item{data_sets}{vector of names of folders in base_folder that contain training data and training solutions}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment