From 1333a8beab66d929da31db48f997ccc44f2bc393 Mon Sep 17 00:00:00 2001 From: Lina Eisenberg <eisenberg@madonna.cbs.mpg.de> Date: Wed, 20 Sep 2023 21:02:58 +0200 Subject: [PATCH] imputation --- data_preparation/data_preparation_2.R | 40 +++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/data_preparation/data_preparation_2.R b/data_preparation/data_preparation_2.R index a074353..b08f1a8 100644 --- a/data_preparation/data_preparation_2.R +++ b/data_preparation/data_preparation_2.R @@ -350,19 +350,24 @@ sum(is.na(data$hypertension_cat)) #n=0 sum(is.na(data$diabetes_cat)) #n=0 sum(is.na(data$tob2_smoking_status)) #n=68 -#need to impute: TIV (numeric), APOE4 (binary: yes or no), BMI (numeric) and smoking status (categorical with 3 levels) based on our main variables: -#imputation method: multivariate imputation (MICE) --> predictive mean matching (pmm) for continuous data, logreg for binary +#need to impute: TIV (numeric), APOE4 (binary: yes or no), BMI (numeric) and smoking status (categorical with 3 levels: 0=no smoker, 1=previous smoker, 2=current smoker) based on our main variables +#imputation method: multivariate imputation (MICE) --> predictive mean matching (pmm) for continuous data, logreg for binary, which for 3 levels? #main variables: p_che, CRP, IL-6, BMI, TGMV, corWMV, meanCT, HCV, WMH, BA -#CAVE: age and BA are highly correlated! --> BA will not be considered in the imputation +#CAVE: age and BA are highly correlated! --> BA will be excluded from the imputation +#TIV is correlated with brain markers --> excluded +#CRP and IL-6 make problems due to highly skewed data --> excluded #install.packages('mice') #version: 3.16.0 -library('mice') +library(mice) +library(ggplot2) #new df including only those variables needed -mice_imp <- subset(data, select = c("adult_prob_age", "adult_prob_gender", "total_activity", "CRPHS_S_NUM_VALUE", "IL6_S_NUM_VALUE", "bmi_bmi","tob2_smoking_status", "EstimatedTotalIntraCranialVol", "TGMV_adj", "mean_ct", "corWMV_adj", "HCV_adj", "total_wmh_normalized", "d_pv_wmh_ratio", "ApoE4_y_n")) +mice_imp <- subset(data, select = c("adult_prob_age", "adult_prob_gender", "bmi_bmi", "total_activity", "tob2_smoking_status", "TGMV_adj", "mean_ct", "corWMV_adj", "HCV_adj", "total_wmh_normalized", "d_pv_wmh_ratio", "ApoE4_y_n")) #check str(mice_imp) +corm=cor(mice_imp, use = "complete.obs") +corrplot::corrplot(corm) #transform mice_imp$ApoE4_y_n <- as.factor(mice_imp$ApoE4_y_n) @@ -384,28 +389,23 @@ missing_pattern=mice::md.pattern(mice_imp,rotate.names = T) #952 indicates the total number of missing values in the data -#Impute all NAs in this dataframe +#Impute all NAs of BMI, APOE4 and smoking status in this dataframe imp_data <- mice(mice_imp, m = 5, - method = c(" "," ", " ", "pmm", "pmm", "pmm", "polyreg", "pmm", "pmm", "pmm", "pmm", "pmm", "pmm", "pmm", "logreg"), + method = c("", "", "pmm", "", "polyreg", "", "", "", "", "", "", "logreg"), maxit = 10 - ) - -#makes problems, presumably due to multicollinearity??! -#maybe one could wait with it and proceed +) -#check for multicollinearity -library(dplyr) -complete_cases <- complete.cases(mice_imp) -mice_imp_complete <- mice_imp[complete_cases,] +#See results +plot(imp_data) +#check which imp is the best +summary(mice_imp) -numeric_mice_imp <- mice_imp_complete %>% - select_if(is.numeric) -correlation_matrix <- cor(numeric_mice_imp) +# +imp_data$imp$bmi_bmi #--> 5 seems the best fit +imp_data$imp$tob2_smoking_status -threshold <- 0.7 -high_correlation_pairs <- which(correlation_matrix > threshold & correlation_matrix < 1, arr.ind = TRUE) ################################################################################ -- GitLab