From 1333a8beab66d929da31db48f997ccc44f2bc393 Mon Sep 17 00:00:00 2001
From: Lina Eisenberg <eisenberg@madonna.cbs.mpg.de>
Date: Wed, 20 Sep 2023 21:02:58 +0200
Subject: [PATCH] imputation

---
 data_preparation/data_preparation_2.R | 40 +++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/data_preparation/data_preparation_2.R b/data_preparation/data_preparation_2.R
index a074353..b08f1a8 100644
--- a/data_preparation/data_preparation_2.R
+++ b/data_preparation/data_preparation_2.R
@@ -350,19 +350,24 @@ sum(is.na(data$hypertension_cat)) #n=0
 sum(is.na(data$diabetes_cat)) #n=0
 sum(is.na(data$tob2_smoking_status)) #n=68
 
-#need to impute: TIV (numeric), APOE4 (binary: yes or no), BMI (numeric) and smoking status (categorical with 3 levels) based on our main variables:
-#imputation method: multivariate imputation (MICE) --> predictive mean matching (pmm) for continuous data, logreg for binary
+#need to impute: TIV (numeric), APOE4 (binary: yes or no), BMI (numeric) and smoking status (categorical with 3 levels: 0=no smoker, 1=previous smoker, 2=current smoker) based on our main variables
+#imputation method: multivariate imputation (MICE) --> predictive mean matching (pmm) for continuous data, logreg for binary, polyreg (polytomous logistic regression) for the 3-level smoking status
 #main variables: p_che, CRP, IL-6, BMI, TGMV, corWMV, meanCT, HCV, WMH, BA
-#CAVE: age and BA are highly correlated! --> BA will not be considered in the imputation
+#CAVE: age and BA are highly correlated! --> BA will be excluded from the imputation
+#TIV is correlated with the brain markers --> excluded from the imputation data frame (so TIV itself is no longer imputed here)
+#CRP and IL-6 cause problems because their distributions are highly skewed --> excluded
 
 #install.packages('mice') #version: 3.16.0
-library('mice')
+library(mice)
+library(ggplot2)
 
 #new df including only those variables needed
-mice_imp <- subset(data, select = c("adult_prob_age", "adult_prob_gender", "total_activity", "CRPHS_S_NUM_VALUE", "IL6_S_NUM_VALUE", "bmi_bmi","tob2_smoking_status", "EstimatedTotalIntraCranialVol", "TGMV_adj", "mean_ct", "corWMV_adj", "HCV_adj", "total_wmh_normalized", "d_pv_wmh_ratio", "ApoE4_y_n"))
+mice_imp <- subset(data, select = c("adult_prob_age", "adult_prob_gender", "bmi_bmi", "total_activity", "tob2_smoking_status", "TGMV_adj", "mean_ct", "corWMV_adj", "HCV_adj", "total_wmh_normalized", "d_pv_wmh_ratio", "ApoE4_y_n"))
 
 #check
 str(mice_imp)
+corm=cor(mice_imp, use = "complete.obs")
+corrplot::corrplot(corm)
 
 #transform
 mice_imp$ApoE4_y_n <- as.factor(mice_imp$ApoE4_y_n)
@@ -384,28 +389,23 @@ missing_pattern=mice::md.pattern(mice_imp,rotate.names = T)
 #952 indicates the total number of missing values in the data
 
 
-#Impute all NAs in this dataframe
+#Impute the NAs of BMI, APOE4 and smoking status only; columns with method "" are not imputed
 imp_data <- mice(mice_imp, 
                  m = 5, 
-                 method = c(" "," ", " ", "pmm", "pmm", "pmm", "polyreg", "pmm", "pmm", "pmm", "pmm", "pmm", "pmm", "pmm", "logreg"),
+                 method = c("", "", "pmm", "", "polyreg", "", "", "", "", "", "", "logreg"),
                  maxit = 10
-                 )
-
-#makes problems, presumably due to multicollinearity??!
-#maybe one could wait with it and proceed
+)
 
-#check for multicollinearity
-library(dplyr)
-complete_cases <- complete.cases(mice_imp)
-mice_imp_complete <- mice_imp[complete_cases,]
+#Trace plots of the imputed values across iterations (convergence check)
+plot(imp_data)
 
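+#NOTE(review): summary(mice_imp) summarises the data BEFORE imputation; to compare the imputations use imp_data (TODO confirm intent)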
+summary(mice_imp)
 
-numeric_mice_imp <- mice_imp_complete %>%
-  select_if(is.numeric)
-correlation_matrix <- cor(numeric_mice_imp)
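+#imputed values per variable: one row per case with a missing value, one column per imputation (m = 5)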
+imp_data$imp$bmi_bmi #--> 5 seems the best fit
+imp_data$imp$tob2_smoking_status
 
-threshold <- 0.7
-high_correlation_pairs <- which(correlation_matrix > threshold & correlation_matrix < 1, arr.ind = TRUE)
 
 
 ################################################################################
-- 
GitLab