Commit 2b5a06d0 authored by ronja.thieleking's avatar ronja.thieleking
Browse files

added more quality checks and the option to exclude extreme outliers only

parent 87a7d982
No preview for this file type
),
##### attribute standard portion size (in grams) to quantity question items (according to original scoring manual by RKI)
std_size = case_when(
item_no == 46 ~ 5,
item_no == 51 ~ 30,
item_no == 61 ~ 60,
item_no == 70 ~ 120,
item_no == 98 ~ 175,
item_no == 104 ~ 350,
item_no == 112 ~ 16,
item_no == 116 ~ 40,
item_no == 120 ~ 25,
item_no %in% c(2,5,7,9,12,15,32,54) ~ 200,
item_no %in% c(17,20,23,63,68,83,85,87,89,91,96,100,102) ~ 150,
item_no %in% c(26,28) ~ 330,
item_no %in% c(30,94) ~ 125,
item_no %in% c(34,36,73,76) ~ 20,
item_no %in% c(38,40,42,44,118) ~ 50,
item_no %in% c(48,108) ~ 15,
item_no %in% c(57,59) ~ 10,
item_no %in% c(78,80) ~ 90,
item_no %in% c(106,110) ~ 100,
item_no %in% c(66,114) ~ 75,
TRUE ~ 1 ## if not quantity question set std_size to 1
)) %>%
##### transform frequency answers to frequency values per month
mutate(answer = case_when(
frequency == 0 ~ answer,
answer == 1 ~ 0, ## never
answer == 2 ~ 1, ## 1x a month
answer == 3 ~ 2.5, ## 2-3x a month
answer == 4 ~ 6, ### 1-2x a week
answer == 5 ~ 14, ### 3-4x a week
answer == 6 ~ 22, ### 5-6x a week
answer == 7 ~ 28, ### once a day
answer == 8 ~ 56, ### twice a day
answer == 9 ~ 84, ### 3x a day
answer == 10 ~ 126, ### 4-5x a day
answer == 11 ~ 168, ### more than 5x a day
TRUE ~ answer
)) %>%
##### for Quantity & Form Questions set variable "Form"/"type" to 1
mutate(
type = case_when(
frequency != 1 & std_size == 1 ~ 1,
TRUE ~ 0
),
##### create variable which indicates the answer code (Antwortkategorie 1-4) which corresponds to standard size of 1 portion
answer_for_one_portion = case_when(
item_no == 110 ~ 4,
item_no %in% c(36,38,63,70,78,80,85,87,
89,91,94,96,100,102,104,
112,116,118,120) ~ 3,
item_no %in% c(2,5,7,9,12,15,32,54,17,
20,23,26,28,34,40,42,44,
48,108,46,51,61,66,114,68,
73,76,83,98,106,108,114) ~ 2,
item_no %in% c(30,57,59) ~ 1,
TRUE ~ 0
)) %>%
### for Form/type & Quantity items: compute amount consumed in relation to 1 portion based on answer code and re-code answers accordingly
mutate(
answer = case_when(
answer_for_one_portion == 0 ~ answer,
answer == -999 ~ answer,
answer_for_one_portion == answer ~ 1,
answer_for_one_portion == 1 ~ answer,
answer_for_one_portion == 2 & answer == 1 ~ 0.5,
answer_for_one_portion == 2 & answer > 2 ~ answer-1,
answer_for_one_portion == 3 & answer == 2 ~ 0.5,
answer_for_one_portion == 3 & answer == 1 ~ 0.25,
answer_for_one_portion == 3 & answer > 3 ~ answer-2,
answer_for_one_portion == 4 & answer == 1 ~ 0.125,
answer_for_one_portion == 4 & answer == 2 ~ 0.25,
answer_for_one_portion == 4 & answer == 3 ~ 0.5,
answer_for_one_portion == 4 & answer > 4 ~ answer-3,
TRUE ~ answer
)
)
#### for quantity questions (not type or frequency) multiply recoded answer * standard portion size
FFQ_sumscores <- FFQ_subscales %>%
filter(food_group != "Form") %>%
group_by(subject) %>%
mutate(
answer = case_when(
answer_for_one_portion == 0 ~ answer, ## does not concern the frequency questions
answer == -999 ~ answer, ## does not concern the type questions
TRUE ~ answer*std_size ### answer: fraction of the standard size; standard size is given in grams
)) %>%
group_by(subject, food_group) %>%
#### compute mean daily intake (divided by 28 days)
summarise(mean_daily_portion_g = prod(answer)/28) ### for each subject and food group: prod(answer) = answer(freq) * answer (quantity)
FFQ_scoring <- list(FFQ_subscales, FFQ_sumscores)
return(FFQ_scoring)
}
results_list <- score.FFQ(d_long)
### write results ----
write.csv(results_list[[1]], "FFQ_mean_daily_portion_raw.csv", row.names = F)
write.csv(results_list[[2]], "FFQ_mean_daily_portion_results.csv", row.names = F)
### import results for nutrient scoring ----
data_FFQ_raw <- read.csv("FFQ_mean_daily_portion_raw.csv", header =T)
data_FFQ <- read.csv("FFQ_mean_daily_portion_results.csv", header =T)
data_FFQ_wide <- data_FFQ %>%
pivot_wider(names_from = food_group, values_from = mean_daily_portion_g)
#### find and curate incorrect mean daily portions ---------------
### FFQ values < 0 (meaning freq was indicated but amount was not, multiplied by -999)
neg_values=data_FFQ[data_FFQ$mean_daily_portion_g < 0 & !is.na(data_FFQ$mean_daily_portion_g),]
#### longitudinal designs (here 4 time points with 2 baseline-follow up pairs) use the following code
#### if 1 time point missing: calculate mean of remaining 3 time points
#### if at least 2 time points missing: exclude baseline-follow up pair
# neg_food_group=list()
# mean_of_3_tp=c(rep(NA, length(neg_values$subject)))
#
......@@ -140,7 +30,8 @@ neg_values=data_FFQ[data_FFQ$mean_daily_portion_g < 0 & !is.na(data_FFQ$mean_dai
# }
#
# data_FFQ[is.na(data_FFQ$mean_daily_portion_g),] ### shows which BL-FU-pair was removed
### cross-sectional data: replace missing values with population mean
##### missing values in cross-sectional data -------------
##### replace missing values with population mean
# create df with only positive values to calculate population means
data_FFQ_wide_pos <- data_FFQ_wide %>%
mutate_all(funs(replace(., .<0, NA)))
......@@ -370,7 +261,8 @@ data_FFQ$Omega3_mg[data_FFQ$ffq_item == item] <- data_FFQ$mean_daily_portion[dat
data_FFQ$Omega6_mg[data_FFQ$ffq_item == item] <- data_FFQ$mean_daily_portion[data_FFQ$ffq_item == item] * as.numeric(data_BLS$`FO6/Omega-6-Fettsäuren_mg/100g`[data_BLS$ffq_item == item]) / 100
}
}
### scoring for items with food subgroups (cross-sectional data) -----
#### scoring for items with food subgroups -----
##### cross-sectional data ---------
#### at first all food subgroups with answers (not NA, not -999)
for(i in 1:length(data_FFQ$subject)){
subject = data_FFQ$subject[i]
......@@ -398,7 +290,7 @@ data_FFQ$Omega3_mg[i] <- mean_daily_portion * as.numeric(data_BLS$`FO3/Omega-3-F
data_FFQ$Omega6_mg[i] <- mean_daily_portion * as.numeric(data_BLS$`FO6/Omega-6-Fettsäuren_mg/100g`[data_BLS$ffq_item == item & data_BLS$answer_ffq == food_subgroup_answer]) / 100
}
}
#### scoring for items with food subgroups (longitudinal data) -----
##### longitudinal data -----
##### if food subgroup without answer (= -999), then check other timepoints for answers
# for(i in 1:length(data_FFQ$subject)){
# subject = data_FFQ$subject[i]
......@@ -498,15 +390,123 @@ data_FFQ$Omega6_mg[i] <- mean_daily_portion * as.numeric(data_BLS$`FO6/Omega-6-F
# data_FFQ[i,5:22] <- 0 ### set all nutrient values to 0 (check your column numbers and edit respectively!!!)
# }
# }
data_FFQ
d
d$FFQ1.Wie.oft.haben.Sie.in.der.letzten.Woche.Milch..einschließlich.Milch.für.Kaffee..Müsli..getrunken.
str(d$FFQ1.Wie.oft.haben.Sie.in.der.letzten.Woche.Milch..einschließlich.Milch.für.Kaffee..Müsli..getrunken.)
summary(d$FFQ1.Wie.oft.haben.Sie.in.der.letzten.Woche.Milch..einschließlich.Milch.für.Kaffee..Müsli..getrunken.)
unique(d$FFQ1.Wie.oft.haben.Sie.in.der.letzten.Woche.Milch..einschließlich.Milch.für.Kaffee..Müsli..getrunken.)
packageVersion(rstatix)
library(rstatix)
packageVersion(rstatix)
library(rstatix)
packageVersion(rstatix)
packageVersion("rstatix")
### export final dataframes ------
write.table(data_FFQ, paste0("FFQ_mean_daily_portion_results_including_nutrients.csv"), sep = ",", row.names=F)
### calculate sum nutrient values per participant (per timepoint) ------
##### cross-sectional data:
nutrients_FFQ = data.frame(subject=unique(data_FFQ$subject), timepoint = 1)
##### longitudinal data:
# nutrients_FFQ = data.frame(subject=rep(unique(data_FFQ$subject),each=length(unique(data_FFQ$timepoint))), timepoint = rep(unique(data_FFQ$timepoint),each=length(unique(data_FFQ$subject))))
nutrients_FFQ[,3:20] = NA
names(nutrients_FFQ)[3] = "Energy_kcal"
names(nutrients_FFQ)[4] = "Protein_g"
names(nutrients_FFQ)[5] = "Fat_g"
names(nutrients_FFQ)[6] = "Carbohydrates_g"
names(nutrients_FFQ)[7] = "Fibre_g"
names(nutrients_FFQ)[8] = "Sugar_g"
names(nutrients_FFQ)[9] = "Cellulose_mg"
names(nutrients_FFQ)[10] = "Lignin_mg"
names(nutrients_FFQ)[11] = "Soluble_Fibre_mg"
names(nutrients_FFQ)[12] = "Insoluble_Fibre_mg"
names(nutrients_FFQ)[13] = "Tyrosin_mg"
names(nutrients_FFQ)[14] = "Tryptophan_mg"
names(nutrients_FFQ)[15] = "Sat_FA_mg"
names(nutrients_FFQ)[16] = "SCFA_mg"
names(nutrients_FFQ)[17] = "MCFA_mg"
names(nutrients_FFQ)[18] = "LCFA_mg"
names(nutrients_FFQ)[19] = "Omega3_mg"
names(nutrients_FFQ)[20] = "Omega6_mg"
### add sum of each macronutrient per subject
str(data_FFQ$Energy_kcal)
for(i in 1:length(nutrients_FFQ$subject)){
k=4 ### set to 4 to start from 5 in loop
for(j in 3:20){ ### number of nutrient output columns
k=k+1 ### number of nutrient input columns in old dataframe
#print(paste("i=", i, "j=", j, "k=", k))
nutrients_FFQ[i,j] = sum(data_FFQ[data_FFQ$subject == nutrients_FFQ$subject[i], k], na.rm = T)
}
}
#### replace zeros with NAs
nutrients_FFQ[nutrients_FFQ$Energy_kcal==0,3:20] <- NA
### check for outliers -------
outliers <- nutrients_FFQ %>%
#group_by(timepoint,intervention) %>%
identify_outliers(Energy_kcal)
##### remove outliers if desired ---------
# nutrients_FFQ_cleaned <- nutrients_FFQ[-which(nutrients_FFQ$subject %in% outliers$subject),]
### add nutrient adjusted for energy with residual method by Willett ----------------
### (Willett, W. C. & Howe, R. Adjustment for total energy intake in epidemiologic studies. Am J Clin Nutr 65, 1220S–8S (1997).)
fit <- lm(Protein_g ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Protein_g <- residuals(fit) # Save the residual values
fit <- lm(Fat_g ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Fat_g <- residuals(fit) # Save the residual values
fit <- lm(Carbohydrates_g ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Carbohydrates_g <- residuals(fit) # Save the residual values
fit <- lm(Fibre_g ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Fibre_g <- residuals(fit) # Save the residual values
fit <- lm(Sugar_g ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Sugar_g <- residuals(fit) # Save the residual values
fit <- lm(Cellulose_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Cellulose_mg <- residuals(fit) # Save the residual values
fit <- lm(Lignin_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Lignin_mg <- residuals(fit) # Save the residual values
fit <- lm(Soluble_Fibre_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Soluble_Fibre_mg <- residuals(fit) # Save the residual values
fit <- lm(Insoluble_Fibre_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Insoluble_Fibre_mg <- residuals(fit) # Save the residual values
fit <- lm(Tyrosin_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Tyrosin_mg <- residuals(fit) # Save the residual values
fit <- lm(Tryptophan_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Tryptophan_mg <- residuals(fit) # Save the residual values
fit <- lm(Sat_FA_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Sat_FA_mg <- residuals(fit) # Save the residual values
fit <- lm(SCFA_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_SCFA_mg <- residuals(fit) # Save the residual values
fit <- lm(MCFA_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_MCFA_mg <- residuals(fit) # Save the residual values
fit <- lm(LCFA_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_LCFA_mg <- residuals(fit) # Save the residual values
fit <- lm(Omega3_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Omega3_mg <- residuals(fit) # Save the residual values
fit <- lm(Omega6_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Omega6_mg <- residuals(fit) # Save the residual values
####### export final dataframes ------
### if outliers removed:
# write.table(nutrients_FFQ_cleaned, "FFQ_mean_daily_portion_results_including_nutrients+residuals.csv", sep = ",", row.names=F)
### if not:
write.table(nutrients_FFQ, "FFQ_mean_daily_portion_results_including_nutrients+residuals.csv", sep = ",", row.names=F)
nutrients_FFQ <- read.csv("FFQ_mean_daily_portion_results_including_nutrients+residuals.csv")
nutrients_FFQ %>%
#group_by(timepoint,intervention) %>%
shapiro_test(Energy_kcal, Protein_g, Fibre_g, Carbohydrates_g, Fat_g, Sugar_g)
### correlation matrix ----
M <- nutrients_FFQ[, c("Energy_kcal","Protein_g","Fat_g","Carbohydrates_g","Sugar_g","Fibre_g")]
M <- M[complete.cases(M),] ### to remove NAs
M <- cor(M, method = "spearman")#### method options: c("pearson", "kendall", "spearman")
png("./Figures/corrplot_macronutrients_FFQ.png", width = 1000, height = 1000, pointsize=16, res = 170)
corrplot(M, method = "color",
type = "upper", order = "original", number.cex = .7,
addCoef.col = "black", # Add coefficient of correlation
tl.col = "black", tl.srt = 90, # Text label color and rotation
diag = FALSE)
dev.off()
h1 <- ggplot(data = nutrients_FFQ, aes(x=Energy_kcal)) +
geom_histogram(col="black", alpha=.5, bins=20) +
ylab("Frequency") + theme_bw() + scale_fill_npg()
h2 <- ggplot(data = nutrients_FFQ, aes(x=Protein_g)) +
geom_histogram(col="black", alpha=.5, bins=20) +
ylab("Frequency") + theme_bw() + scale_fill_npg()
h3 <- ggplot(data = nutrients_FFQ, aes(x=Fat_g)) +
geom_histogram(col="black", alpha=.5, bins=20) +
ylab("Frequency") + theme_bw() + scale_fill_npg()
h4 <- ggplot(data = nutrients_FFQ, aes(x=Carbohydrates_g)) +
geom_histogram(col="black", alpha=.5, bins=20) +
ylab("Frequency") + theme_bw() + scale_fill_npg()
h5 <- ggplot(data = nutrients_FFQ, aes(x=Fibre_g)) +
geom_histogram(col="black", alpha=.5, bins=20) +
ylab("Frequency") + theme_bw() + scale_fill_npg()
h6 <- ggplot(data = nutrients_FFQ, aes(x=Sugar_g)) +
geom_histogram(col="black", alpha=.5, bins=20) +
ylab("Frequency") + theme_bw() + scale_fill_npg()
(h <- ggarrange(h1, h2, h3, h4, h5, h6))
ggexport(h, filename = "./Figures/hist_nutrients.png", width = 1200, height = 600, pointsize = 18, res = 220,verbose = TRUE)
Ronja Thieleking,thieleking,fermium.cbs.mpg.de,03.11.2022 10:30,file:///data/hu_thieleking/.config/libreoffice/4;
\ No newline at end of file
File mode changed from 100644 to 100755
This diff is collapsed.
This diff is collapsed.
Figures/corrplot_macronutrients_FFQ.png

79.8 KB | W: | H:

Figures/corrplot_macronutrients_FFQ.png

78.9 KB | W: | H:

Figures/corrplot_macronutrients_FFQ.png
Figures/corrplot_macronutrients_FFQ.png
Figures/corrplot_macronutrients_FFQ.png
Figures/corrplot_macronutrients_FFQ.png
  • 2-up
  • Swipe
  • Onion skin
Figures/hist_nutrients.png

34.1 KB | W: | H:

Figures/hist_nutrients.png

33.7 KB | W: | H:

Figures/hist_nutrients.png
Figures/hist_nutrients.png
Figures/hist_nutrients.png
Figures/hist_nutrients.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -279,7 +279,7 @@ data_FFQ_wide <- data_FFQ %>%
pivot_wider(names_from = food_group, values_from = mean_daily_portion_g)
#### find and curate incorrect mean daily portions ---------------
### find and curate incorrect mean daily portions ---------------
### FFQ values < 0 (meaning freq was indicated but amount was not, multiplied by -999)
neg_values=data_FFQ[data_FFQ$mean_daily_portion_g < 0 & !is.na(data_FFQ$mean_daily_portion_g),]
......@@ -324,168 +324,30 @@ neg_values=data_FFQ[data_FFQ$mean_daily_portion_g < 0 & !is.na(data_FFQ$mean_dai
##### missing values in cross-sectional data -------------
##### replace missing values with population mean
# create df with only positive values to calculate population means
##### replace missing values (frequency given but no amount) with population mean
##### create df with only positive values to calculate population means
data_FFQ_wide_pos <- data_FFQ_wide %>%
mutate_all(funs(replace(., .<0, NA)))
### if frequency given but no amount => replace with population mean
### calculate means for population only from positive values
mean_alcohol_mix_drinks <- mean(data_FFQ_wide_pos$`alkohol. Mischgetränke`, na.rm = TRUE)
mean_nonalcoholic_beer <- mean(data_FFQ_wide_pos$`alkoholfr. Bier`, na.rm = TRUE)
mean_beer <- mean(data_FFQ_wide_pos$Bier, na.rm = TRUE)
mean_bratwurst <- mean(data_FFQ_wide_pos$Bratwurst, na.rm = TRUE)
mean_butter <- mean(data_FFQ_wide_pos$`Butter/ Magarine`, na.rm = TRUE)
mean_cornflakes <- mean(data_FFQ_wide_pos$Cornflakes, na.rm = TRUE)
mean_doener <- mean(data_FFQ_wide_pos$`Döner (Fleisch)`, na.rm = TRUE)
mean_eggs <- mean(data_FFQ_wide_pos$Eier, na.rm = TRUE)
mean_icecream <- mean(data_FFQ_wide_pos$Eis, na.rm = TRUE)
mean_fish_warm <- mean(data_FFQ_wide_pos$`Fisch als warme Mahlzeit`, na.rm = TRUE)
mean_meat <- mean(data_FFQ_wide_pos$Fleisch, na.rm = TRUE)
mean_fruits <- mean(data_FFQ_wide_pos$`frisches Obst`, na.rm = TRUE)
mean_creamcheese <- mean(data_FFQ_wide_pos$Frischkäse, na.rm = TRUE)
mean_fruittea <- mean(data_FFQ_wide_pos$`Früchte-/ Kräutertee`, na.rm = TRUE)
mean_juice <- mean(data_FFQ_wide_pos$Fruchtsaft, na.rm = TRUE)
mean_poultry <- mean(data_FFQ_wide_pos$Geflügel, na.rm = TRUE)
mean_vegetables_cooked <- mean(data_FFQ_wide_pos$`gegartes Gemüse`, na.rm = TRUE)
mean_potatoes_fried <- mean(data_FFQ_wide_pos$`gebratene Kartoffeln`, na.rm = TRUE)
mean_fruits_cooked <- mean(data_FFQ_wide_pos$`gegartes Obst`, na.rm = TRUE)
mean_potatoes_cooked <- mean(data_FFQ_wide_pos$`gekochte Kartoffeln`, na.rm = TRUE)
mean_vegetable_juice <- mean(data_FFQ_wide_pos$Gemüsesaft, na.rm = TRUE)
mean_graubrot <- mean(data_FFQ_wide_pos$Graubrot, na.rm = TRUE)
mean_schnaps <- mean(data_FFQ_wide_pos$Hochprozentiges, na.rm = TRUE)
mean_honey <- mean(data_FFQ_wide_pos$`Honig/ Marmelade`, na.rm = TRUE)
mean_pulses <- mean(data_FFQ_wide_pos$Hülsenfrüchte, na.rm = TRUE)
mean_coffee <- mean(data_FFQ_wide_pos$Kaffee, na.rm = TRUE)
mean_calorie_red_drinks <- mean(data_FFQ_wide_pos$`kalorienr. Erfrischungsgetränke`, na.rm = TRUE)
mean_fish_cold <- mean(data_FFQ_wide_pos$`kalter Fisch`, na.rm = TRUE)
mean_crisps <- mean(data_FFQ_wide_pos$Kartoffelchips, na.rm = TRUE)
mean_cheese <- mean(data_FFQ_wide_pos$Käse, na.rm = TRUE)
mean_cookies <- mean(data_FFQ_wide_pos$Kekse, na.rm = TRUE)
mean_cakes <- mean(data_FFQ_wide_pos$`Kuchen/ Backwaren`, na.rm = TRUE)
mean_cookies <- mean(data_FFQ_wide_pos$Kekse, na.rm = TRUE)
mean_milk <- mean(data_FFQ_wide_pos$Milch, na.rm = TRUE)
mean_cookies <- mean(data_FFQ_wide_pos$Müsli, na.rm = TRUE)
mean_pasta <- mean(data_FFQ_wide_pos$Nudeln, na.rm = TRUE)
mean_nutspread <- mean(data_FFQ_wide_pos$`Nuss-Nougatcreme`, na.rm = TRUE)
mean_nuts <- mean(data_FFQ_wide_pos$Nüsse, na.rm = TRUE)
mean_pizza <- mean(data_FFQ_wide_pos$Pizza, na.rm = TRUE)
mean_chips <- mean(data_FFQ_wide_pos$`Pommes Frites`, na.rm = TRUE)
mean_yoghurt <- mean(data_FFQ_wide_pos$`Quark/ Joghurt`, na.rm = TRUE)
mean_rice <- mean(data_FFQ_wide_pos$Reis, na.rm = TRUE)
mean_vegetables_raw <- mean(data_FFQ_wide_pos$`rohes Gemüse`, na.rm = TRUE)
mean_snacks <- mean(data_FFQ_wide_pos$Salzgebäck, na.rm = TRUE)
mean_ham <- mean(data_FFQ_wide_pos$Schinken, na.rm = TRUE)
mean_choc <- mean(data_FFQ_wide_pos$Schokolade, na.rm = TRUE)
mean_tea <- mean(data_FFQ_wide_pos$`schwarzer/ grüner Tee`, na.rm = TRUE)
mean_sweets <- mean(data_FFQ_wide_pos$Süßigkeiten, na.rm = TRUE)
mean_bread_whole <- mean(data_FFQ_wide_pos$Vollkornbrot, na.rm = TRUE)
mean_water <- mean(data_FFQ_wide_pos$Wasser, na.rm = TRUE)
mean_wine <- mean(data_FFQ_wide_pos$`Wein/ Sekt`, na.rm = TRUE)
mean_bread_white <- mean(data_FFQ_wide_pos$Weißbrot, na.rm = TRUE)
mean_wurst <- mean(data_FFQ_wide_pos$Wurst, na.rm = TRUE)
mean_lemonade <- mean(data_FFQ_wide_pos$`zuckerh. Erfrischungsgetränke`, na.rm = TRUE)
#### replace negative values in original dataframe
for(i in 1:length(data_FFQ_wide$subject)){
if (data_FFQ_wide$`Butter/ Magarine`[i] < 0 & !is.na(data_FFQ_wide$`Butter/ Magarine`[i])) {
data_FFQ_wide$`Butter/ Magarine`[i] <- mean_butter
}
if (data_FFQ_wide$Frischkäse[i]< 0 & !is.na(data_FFQ_wide$Frischkäse[i])) {
data_FFQ_wide$Frischkäse[i] <- mean_creamcheese
}
if (data_FFQ_wide$`kalorienr. Erfrischungsgetränke`[i]< 0 & !is.na(data_FFQ_wide$`kalorienr. Erfrischungsgetränke`[i])) {
data_FFQ_wide$`kalorienr. Erfrischungsgetränke`[i] <- mean_calorie_red_drinks
}
if (data_FFQ_wide$Käse[i]< 0 & !is.na(data_FFQ_wide$Käse[i])) {
data_FFQ_wide$Käse[i] <- mean_cheese
}
if (data_FFQ_wide$Milch[i]< 0 & !is.na(data_FFQ_wide$Milch[i])) {
data_FFQ_wide$Milch[i] <- mean_milk
}
if (data_FFQ_wide$Hochprozentiges[i]< 0 & !is.na(data_FFQ_wide$Hochprozentiges[i])) {
data_FFQ_wide$Hochprozentiges[i] <- mean_schnaps
}
if (data_FFQ_wide$`Nuss-Nougatcreme`[i]< 0 & !is.na(data_FFQ_wide$`Nuss-Nougatcreme`[i])) {
data_FFQ_wide$`Nuss-Nougatcreme`[i] <- mean_nutspread
}
if (data_FFQ_wide$`zuckerh. Erfrischungsgetränke`[i]< 0 & !is.na(data_FFQ_wide$`zuckerh. Erfrischungsgetränke`[i])) {
data_FFQ_wide$`zuckerh. Erfrischungsgetränke`[i] <- mean_lemonade
}
if (data_FFQ_wide$`alkohol. Mischgetränke`[i]< 0 & !is.na(data_FFQ_wide$`alkohol. Mischgetränke`[i])) {
data_FFQ_wide$`alkohol. Mischgetränke`[i] <- mean_alcohol_mix_drinks
}
if (data_FFQ_wide$Bier[i]< 0 & !is.na(data_FFQ_wide$Bier[i])) {
data_FFQ_wide$Bier[i] <- mean_beer
}
if (data_FFQ_wide$Eis[i]< 0 & !is.na(data_FFQ_wide$Eis[i])) {
data_FFQ_wide$Eis[i] <- mean_icecream
}
if (data_FFQ_wide$`Fisch als warme Mahlzeit`[i]< 0 & !is.na(data_FFQ_wide$`Fisch als warme Mahlzeit`[i])) {
data_FFQ_wide$`Fisch als warme Mahlzeit`[i] <- mean_fish_warm
}
if (data_FFQ_wide$`gebratene Kartoffeln`[i]< 0 & !is.na(data_FFQ_wide$`gebratene Kartoffeln`[i])) {
data_FFQ_wide$`gebratene Kartoffeln`[i] <- mean_potatoes_fried
}
if (data_FFQ_wide$`gegartes Gemüse`[i]< 0 & !is.na(data_FFQ_wide$`gegartes Gemüse`[i])) {
data_FFQ_wide$`gegartes Gemüse`[i] <- mean_vegetables_cooked
}
if (data_FFQ_wide$Hülsenfrüchte[i]< 0 & !is.na(data_FFQ_wide$Hülsenfrüchte[i])) {
data_FFQ_wide$Hülsenfrüchte[i] <- mean_pulses
}
if (data_FFQ_wide$`kalter Fisch`[i]< 0 & !is.na(data_FFQ_wide$`kalter Fisch`[i])) {
data_FFQ_wide$`kalter Fisch`[i] <- mean_fish_cold
}
if (data_FFQ_wide$Schinken[i]< 0 & !is.na(data_FFQ_wide$Schinken[i])) {
data_FFQ_wide$Schinken[i] <- mean_ham
#### loop over all columns j starting with "alk. Mischgetränke = column no. 2
for(j in 2:length(colnames(data_FFQ_wide))){
population_mean <- mean(as.numeric(unlist(data_FFQ_wide_pos[,j])), na.rm = TRUE)
for(i in 1:length(data_FFQ_wide$subject)){
if (data_FFQ_wide[i,j] < 0 & !is.na(data_FFQ_wide[i,j])) {
data_FFQ_wide[i,j] <- population_mean
}
if (data_FFQ_wide$Süßigkeiten[i]< 0 & !is.na(data_FFQ_wide$Süßigkeiten[i])) {
data_FFQ_wide$Süßigkeiten[i] <- mean_sweets
}
if (data_FFQ_wide$`Früchte-/ Kräutertee`[i]< 0 & !is.na(data_FFQ_wide$`Früchte-/ Kräutertee`[i])) {
data_FFQ_wide$`Früchte-/ Kräutertee`[i] <- mean_fruittea
}
if (data_FFQ_wide$`schwarzer/ grüner Tee`[i]< 0 & !is.na(data_FFQ_wide$`schwarzer/ grüner Tee`[i])) {
data_FFQ_wide$`schwarzer/ grüner Tee`[i] <- mean_tea
}
if (data_FFQ_wide$Wasser[i]< 0 & !is.na(data_FFQ_wide$Wasser[i])) {
data_FFQ_wide$Wasser[i] <- mean_water
}
if (data_FFQ_wide$Graubrot[i]< 0 & !is.na(data_FFQ_wide$Graubrot[i])) {
data_FFQ_wide$Graubrot[i] <- mean_graubrot
}
if (data_FFQ_wide$Geflügel[i]< 0 & !is.na(data_FFQ_wide$Geflügel[i])) {
data_FFQ_wide$Geflügel[i] <- mean_poultry
}
if (data_FFQ_wide$`Honig/ Marmelade`[i]< 0 & !is.na(data_FFQ_wide$`Honig/ Marmelade`[i])) {
data_FFQ_wide$`Honig/ Marmelade`[i] <- mean_honey
}
if (data_FFQ_wide$`alkoholfr. Bier`[i]< 0 & !is.na(data_FFQ_wide$`alkoholfr. Bier`[i])) {
data_FFQ_wide$`alkoholfr. Bier`[i] <- mean_nonalcoholic_beer
}
if (data_FFQ_wide$Nüsse[i]< 0 & !is.na(data_FFQ_wide$Nüsse[i])) {
data_FFQ_wide$Nüsse[i] <- mean_nuts
}
if (data_FFQ_wide$Pizza[i]< 0 & !is.na(data_FFQ_wide$Pizza[i])) {
data_FFQ_wide$Pizza[i] <- mean_pizza
}
if (data_FFQ_wide$`rohes Gemüse`[i]< 0 & !is.na(data_FFQ_wide$`rohes Gemüse`[i])) {
data_FFQ_wide$`rohes Gemüse`[i] <- mean_vegetables_raw
}
if (data_FFQ_wide$`Wein/ Sekt`[i]< 0 & !is.na(data_FFQ_wide$`Wein/ Sekt`[i])) {
data_FFQ_wide$`Wein/ Sekt`[i] <- mean_wine
}
if (data_FFQ_wide$Cornflakes[i]< 0 & !is.na(data_FFQ_wide$Cornflakes[i])) {
data_FFQ_wide$Cornflakes[i] <- mean_cornflakes
}
}
sum(data_FFQ_wide$mean_daily_portion_g < 0) ### should be zero
#### re-transfer from corrected wide format to original long format ------
data_FFQ <- data_FFQ_wide %>%
pivot_longer(cols = c(2:54), names_to = "food_group", values_to = "mean_daily_portion")
### QUALITY CHECK (negative mean daily portions) ----------
#### check if all mean daily portions > 0
sum(data_FFQ$mean_daily_portion < 0, na.rm = T) ### should be zero
#### add ffq_item to all tables ------
ffq_item = c()
......@@ -711,7 +573,24 @@ for(i in 1:length(data_FFQ$subject)){
# }
### QUALITY CHECK (energy content plausability) -------
##### calculate energy from protein, carbs and fat and compare to energy value from BLS -----------
### 4 kcal/g from carbs, 4 kcal/g from protein and 9 kcal/g from fat
### source: https://www.nal.usda.gov/programs/fnic#faq--how-many-calories-are-in-
data_FFQ <- add_column(data_FFQ, Energy_kcal_sum_macros=NA, .after="Energy_kcal" )
data_FFQ <- add_column(data_FFQ, Energy_quotient=NA, .after="Energy_kcal_sum_macros" )
data_FFQ$Energy_kcal_sum_macros <- data_FFQ$Protein_g * 4 + data_FFQ$Carbohydrates_g * 4 + data_FFQ$Fat_g * 9
data_FFQ$Energy_quotient <- data_FFQ$Energy_kcal_sum_macros / data_FFQ$Energy_kcal
### check if any food groups have higher or lower energy content based on protein, carbs and fat than calculated with the BLS
### alcoholic drinks have a lower energy content as alcohol is disregarded
### teas have a lower energy content due to small deviations regarding the very low energy content
unique(data_FFQ$food_group[data_FFQ$Energy_quotient < 0.9])
unique(data_FFQ$food_group[data_FFQ$Energy_quotient > 1.05])
##### clean up dataframe -----
data_FFQ <- subset(data_FFQ, select=-c(Energy_kcal_sum_macros, Energy_quotient))
### export final dataframes ------
......@@ -746,7 +625,6 @@ names(nutrients_FFQ)[19] = "Omega3_mg"
names(nutrients_FFQ)[20] = "Omega6_mg"
### add sum of each macronutrient per subject
str(data_FFQ$Energy_kcal)
for(i in 1:length(nutrients_FFQ$subject)){
k=4 ### set to 4 to start from 5 in loop
......@@ -761,16 +639,22 @@ for(i in 1:length(nutrients_FFQ$subject)){
nutrients_FFQ[nutrients_FFQ$Energy_kcal==0,3:20] <- NA
### check for outliers -------
### QUALITY CHECK (detect and remove energy outliers) --------
outliers <- nutrients_FFQ %>%
#group_by(timepoint,intervention) %>%
identify_outliers(Energy_kcal)
outliers
##### remove outliers if desired ---------
##### all outliers
# nutrients_FFQ_cleaned <- nutrients_FFQ[-which(nutrients_FFQ$subject %in% outliers$subject),]
##### only extreme outliers
nutrients_FFQ_cleaned <- nutrients_FFQ[-which(nutrients_FFQ$subject %in% outliers$subject[outliers$is.extreme==TRUE]),]
### add nutrient adjusted for energy with residual method by Willett ----------------
### add nutrient values adjusted for energy with residual method by Willett for statistical analyses ----------------
### (Willett, W. C. & Howe, R. Adjustment for total energy intake in epidemiologic studies. Am J Clin Nutr 65, 1220S–8S (1997).)
##### TODO: change "data = nutrients_FFQ" to nutrients_FFQ_cleaned if outliers removed
fit <- lm(Protein_g ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Protein_g <- residuals(fit) # Save the residual values
......@@ -807,18 +691,54 @@ nutrients_FFQ$residuals_Omega3_mg <- residuals(fit) # Save the residual values
fit <- lm(Omega6_mg ~ Energy_kcal, data = nutrients_FFQ) # fit the model
nutrients_FFQ$residuals_Omega6_mg <- residuals(fit) # Save the residual values
fit <- lm(Protein_g ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Protein_g <- residuals(fit) # Save the residual values
fit <- lm(Fat_g ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Fat_g <- residuals(fit) # Save the residual values
fit <- lm(Carbohydrates_g ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Carbohydrates_g <- residuals(fit) # Save the residual values
fit <- lm(Fibre_g ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Fibre_g <- residuals(fit) # Save the residual values
fit <- lm(Sugar_g ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Sugar_g <- residuals(fit) # Save the residual values
fit <- lm(Cellulose_mg ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Cellulose_mg <- residuals(fit) # Save the residual values
fit <- lm(Lignin_mg ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Lignin_mg <- residuals(fit) # Save the residual values
fit <- lm(Soluble_Fibre_mg ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Soluble_Fibre_mg <- residuals(fit) # Save the residual values
fit <- lm(Insoluble_Fibre_mg ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Insoluble_Fibre_mg <- residuals(fit) # Save the residual values
fit <- lm(Tyrosin_mg ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Tyrosin_mg <- residuals(fit) # Save the residual values
fit <- lm(Tryptophan_mg ~ Energy_kcal, data = nutrients_FFQ_cleaned) # fit the model
nutrients_FFQ_cleaned$residuals_Tryptophan_mg <- residuals(fit) # Save the residual values