from __future__ import division, print_function, unicode_literals # -*- coding: utf-8 -*- """ Created on Mon Nov 25 16:15:47 2019 @author: freiche """ """ A Python script that uses numpy and pyper with R and the "lme4" library to compute relations with linear mixed effects models. Install the "lme4" library with: R -e "install.packages('lme4', repos='http://cran.r-project.org')" """ import difflib import numpy as np import pyper import cran DEFAULT_BS_ITER = 1000 def classify_treatment_repetition(analysis, id_ctl="co", id_trt="", id_ctl_res="", id_trt_res=""): """Convenience method for assigning treatment and repetition This method pairs treatments and repetitions in an analysis using the measurement titles and identifiers given as keyword arguments. Parameters ---------- analysis: shapeout.analysis.Analysis The analysis instance to use. The titles of the individual measurements will be searched for the `id_*` terms. id_ctl: str Identifies a control measurement. id_ctl_res: str Identifies a control measurement in the reservoir. Set to an empty string to disable. id_trt: str Identifies the treatment measurement. Set to an empty string to use all non-control measurements as treatments. id_trt_res: str Identifies the treatment measurement in the reservoir. Must be set if `id_ctl_res` is used. """ # sanity checks if id_ctl == "" and id_trt == "": raise ValueError("At least `id_ctl` or `id_trt` must be set!") idlist = [] for mm in analysis: if mm.config["setup"]["chip region"] == "reservoir": if id_ctl_res and id_ctl_res in mm.title: idlist.append(["res ctl", mm]) elif id_trt_res and id_trt_res in mm.title: idlist.append(["res trt", mm]) elif id_ctl_res == "": idlist.append(["res ctl", mm]) elif id_trt_res == "": idlist.append(["res trt", mm]) else: idlist.append(["none", mm]) else: if id_ctl and id_ctl in mm.title: idlist.append(["ctl", mm]) elif id_trt and id_trt in mm.title: idlist.append(["trt", mm]) elif id_ctl == "": idlist.append(["ctl", mm]) elif id_trt == "": idlist.append(["trt", mm]) else: idlist.append(["none", mm]) # extract and rename treatment treatment = [tt for (tt, mm) in idlist] treatment = [tt.replace("res", "Reservoir") for tt in treatment] treatment = [tt.replace("ctl", "Control") for tt in treatment] treatment = [tt.replace("trt", "Treatment") for tt in treatment] treatment = [tt.replace("none", "None") for tt in treatment] assert len(treatment) == len(analysis) # identify timeunit via similarity analysis ctl_str = [mm.title if tt == "ctl" else "" for (tt, mm) in idlist] ctl_r_str = [mm.title if tt == "res ctl" else "" for (tt, mm) in idlist] trt_str = [mm.title if tt == "trt" else "" for (tt, mm) in idlist ] trt_r_str = [mm.title if tt == "res trt" else "" for (tt, mm) in idlist] matchids = match_similar_strings(ctl_str, trt_str, ctl_r_str, trt_r_str) timeunit = np.zeros(len(analysis)) for ii, match in enumerate(matchids): timeunit[match[0]] = ii+1 timeunit[match[1]] = ii+1 if id_ctl_res or id_trt_res: timeunit[match[2]] = ii+1 timeunit[match[3]] = ii+1 # Set all non-paired treatments to "None" for ii, tu in enumerate(timeunit): if tu == 0: treatment[ii] = "None" return treatment, timeunit def match_similar_strings(a, b, c, d): """Similarity analysis to identify string-matches in four lists Given four lists of strings a, b, c, and d. Find the strings that match best using similarity analysis and return the matching list IDs with highest similarity first. Empty strings are ignored. For instance, the lists a = ["peter", "hans", "", "golf"] b = ["gogo", "ham", "freddy", ""] c = ["red", "gans", "", "hugo"] d = ["old", "futur", "erst", "ha"] will return the following match IDs: [1, 1, 1, 3] [3, 0, 3, 0] [0, 2, 0, 2] which means that these words are similar: ["hans", "ham", "gans", "ha"] ["golf", "gogo", "hugo", "old"] ["peter", "freddy", "red", "erst"] """ ratio = lambda x, y: difflib.SequenceMatcher(a=x, b=y).ratio() n = len(a) assert len(a) == len(b) == len(c) == len(d) # build up simliarity matrix smat = np.zeros((n, n, n, n)) for ii in range(n): for jj in range(n): if a[ii] and b[jj]: ratij = ratio(a[ii], b[jj]) else: ratij = 0 for kk in range(n): if a[ii] and c[kk]: ratik = ratio(a[ii], c[kk]) else: ratik = 0 for ll in range(n): if a[ii] and d[ll]: ratil = ratio(a[ii], d[ll]) else: ratil = 0 smat[ii, jj, kk, ll] = ratij + ratik + ratil # match with maxima matchids = [] for _ in range(n): if np.max(smat) == 0: break ai, aj, ak, al = np.argwhere(smat==smat.max())[0] matchids.append([ai, aj, ak, al]) smat[ai, :, :, :] = 0 smat[:, aj, :, :] = 0 smat[:, :, ak, :] = 0 smat[:, :, :, al] = 0 return matchids def diffdef(y, yR, bs_iter=DEFAULT_BS_ITER, rs=117): """ Computes bootstrapped median distributions of same size for two distributions of different size. Parameters ---------- y: 1d ndarray of length N Channel data yR: 1d ndarray of length M Reservoir data bs_iter: int Number of bootstrapping iterations to perform rs: int Random state seed for random number generator Returns ------- median: nd array of shape (bs_iter, 1) Boostrap distribution of medians of y median_r: nd array of shape (bs_iter, 1) Boostrap distribution of medians of yR """ # Convert to arrays y = np.array(y) yR = np.array(yR) # Seed random numbers that are reproducible on different machines prng_object = np.random.RandomState(rs) # Initialize median arrays Median = np.zeros([bs_iter, 1]) MedianR = np.zeros([bs_iter, 1]) # If this loop is still too slow, we could get rid of it and # do everything with arrays. Depends on whether we will # eventually run into memory problems with array sizes # of y*bs_iter and yR*bs_iter. for q in range(bs_iter): # Channel data: # Compute random indices and draw from y draw_y_idx = prng_object.randint(0, len(y), len(y)) y_resample = y[draw_y_idx] Median[q, 0] = np.nanmedian(y_resample) # Reservoir data # Compute random indices and draw from yR draw_yR_idx = prng_object.randint(0, len(yR), len(yR)) yR_resample = yR[draw_yR_idx] MedianR[q, 0] = np.nanmedian(yR_resample) return [Median, MedianR] def linmixmod(xs, treatment, timeunit, model='lmm', RCMD=cran.rcmd): ''' Linear Mixed-Effects Model computation for one fixed effect and one random effect. This function uses the R packages "lme4" and "stats". The response variable is modeled using two linear mixed effect models (Model and Nullmodel) of the form: - xs~treatment+(1+treatment|timeunit) (Random intercept + random slope model) - xs~(1+treatment|timeunit) (Nullmodel without the fixed effect "treatment") Both models are compared in R using "anova" (from the R-package "stats") which performs a likelihood ratio test to obtain the p-Value for the significance of the fixed effect (treatment). Optionally differential deformations are computed which are then used in the Linear Mixed Model Parameters ---------- xs: list of multiple 1D ndarrays Each index of `xs` contains an array of response variables. (eg. list containing "area_um" data of several measurements) treatment: list Each item is a description/identifier for a treatment. The enumeration matches the index of `xs`. treatment[i] can be 'Control', 'Treatment', 'Reservoir Control' or 'Reservoir Treatment'. If 'Reservoir ...' is chosen, the algorithm will perform a bootstrapping algorithm that removes the median from each Channel measurement. That means for each 'Control' or 'Treatment' has to exist a 'Reservoir ...' measurement. The resulting Differential deformations are then used in the Linear Mixed Model. Values of 'None' are excluded from the analysis. timeunit: list Each item is a description/identifier for a time. The enumeration matches the index of `xs`. (e.g. list containing integers "1" and "2" according to the day at which the content in `xs` was measured) Values of '0' are excluded from the analysis. model: string 'lmm': A linear mixed model will be applied 'glmm': A generalized linear mixed model will be applied Returns ------- (Generalized) Linear Mixed Effects Model Result: dictionary The dictionary contains: -Estimate: the average value of cells that had Treatment 1 -Fixed Effect: Change of the estimate value due to the Treatment 2 -Std Error for the Estimate -Std Error for the Fixed Effect -p-Value References ---------- .. [1] R package "lme4": Bates D, Maechler M, Bolker B and Walker S (2015). lme4: Linear mixed- effects models using Eigen and S4. R package version 1.1-9, https://CRAN.R-project.org/package=lme4. .. [2] R function "anova" from package "stats": Chambers, J. M. and Hastie, T. J. (1992) Statistical Models in S, Wadsworth & Brooks/Cole Examples ------- import numpy as np import pyper from nptdms import TdmsFile import os xs = [ [100,99,80,120,140,150,100,100,110,111,140,145], #Larger values (Channel1) [20,10,5,16,14,22,27,26,5,10,11,8,15,17,20,9], #Smaller values (Reservoir1) [115,110,90,110,145,155,110,120,115,120,120,150,100,90,100], #Larger values (Channel2) [30,30,15,26,24,32,37,36,15,20,21,18,25,27,30,19], #Smaller values (Reservoir2) [150,150,130,170,190,250,150,150,160,161,180,195,130,120,125,130,125], [2,1,5,6,4,2,7,6,5,10,1,8,5,7,2,9,11,8,13], [155,155,135,175,195,255,155,155,165,165,185, 200,135,125,130,135,140,150,135,140], [25,15,19,26,44,42,35,20,15,10,11,28,35,10,25,13]] treatment1 = ['Control', 'Reservoir Control', 'Control', 'Reservoir Control',\ 'Treatment', 'Reservoir Treatment','Treatment', 'Reservoir Treatment'] timeunit1 = [1, 1, 2, 2, 1, 1, 2, 2] #Example 1: linear mixed models on differential deformations Result_1 = linmixmod(xs=xs,treatment=treatment1,timeunit=timeunit1,model='lmm') #Result_1:Estimate=93.69375 (i.e. the average Control value is 93.69) # FixedEffect=43.93 (i.e. The treatment leads to an increase) # p-Value(Likelihood Ratio Test)=0.0006026 (i.e. the increase is significant) #Example 2: Ordinary Linear mixed models #'Reservoir' measurements are now Controls #'Channel' measurements are Treatments #This does not use differential deformation in linmixmod() treatment2 = ['Treatment', 'Control', 'Treatment', 'Control',\ 'Treatment', 'Control','Treatment', 'Control'] timeunit2 = [1, 1, 2, 2, 3, 3, 4, 4] Result_2 = linmixmod(xs=xs,treatment=treatment2,timeunit=timeunit2,model='lmm') #Result_2:Estimate=17.17 (i.e. the average Control value is 17.17 ) # FixedEffect=120.257 (i.e. The treatment leads to an increase) # p-Value(Likelihood Ratio Test)=0.00033 (i.e. the deformation # increases significantly) #Example 3: Generalized Linear mixed models treatment3 = ['Treatment', 'Control', 'Treatment', 'Control',\ 'Treatment', 'Control','Treatment', 'Control'] timeunit3 = [1, 1, 2, 2, 3, 3, 4, 4] Result_3 = linmixmod(xs=xs,treatment=treatment3,timeunit=timeunit3,model='glmm') #Result_3:Estimate=2.71 (i.e. the average Control value is exp(2.71)=15.08) # FixedEffect=2.19 (i.e. The treatment leads to an increase) # p-Value(Likelihood Ratio Test)=0.00366 (i.e. the deformation # increases significantly) ''' modelfunc = "xs~treatment+(1+treatment|timeunit)" nullmodelfunc = "xs~(1+treatment|timeunit)" # Check if all input lists have the same length if len(xs) != len(treatment) or len(xs) != len(timeunit): msg = "`treatment` and `timeunit` not defined for all variables!" raise ValueError(msg) if len(xs) < 3: msg = "Linear Mixed Models require repeated measurements. " +\ "Please select more treatment repetitions." raise ValueError(msg) # Check that names are valid for trt in treatment: if trt not in ["None", "Control", "Reservoir Control", "Treatment", "Reservoir Treatment"]: raise ValueError("Unknown treatment: '{}'".format(trt)) # Remove "None"s and "0"s treatment = np.array(treatment) timeunit = np.array(timeunit) xs = np.array(xs) invalid = np.logical_or(treatment == "None", timeunit == 0) treatment = list(treatment[~invalid]) timeunit = list(timeunit[~invalid]) xs = [xi for ii, xi in enumerate(xs) if ~invalid[ii]] # convert to ndarray xs = [np.array(xi, dtype=float) for xi in xs] # remove nan/inf values xs = [xi[~np.logical_or(np.isnan(xi), np.isinf(xi))] for xi in xs] ######################Differential Deformation############################# # If the user selected 'Control-Reservoir' and/or 'Treatment-Reservoir' Median_DiffDef = [] TimeUnit, Treatment = [], [] if 'Reservoir Control' in treatment or 'Reservoir Treatment' in treatment: if model == 'glmm': Head_string = "GENERALIZED LINEAR MIXED MODEL ON BOOTSTAP-DISTRIBUTIONS: \n" +\ "---Results are in log space (loglink was used)--- \n" if model == 'lmm': Head_string = "LINEAR MIXED MODEL ON BOOTSTAP-DISTRIBUTIONS: \n" # Find the timeunits for Control where_contr_ch = np.where(np.array(treatment) == 'Control') timeunit_contr_ch = np.array(timeunit)[where_contr_ch] # Find the timeunits for Treatment where_treat_ch = np.where(np.array(treatment) == 'Treatment') timeunit_treat_ch = np.array(timeunit)[where_treat_ch] for n in np.unique(timeunit_contr_ch): where_time = np.where(np.array(timeunit) == n) xs_n = np.array(xs)[where_time] treatment_n = np.array(treatment)[where_time] where_contr_ch = np.where(np.array(treatment_n) == 'Control') xs_n_contr_ch = xs_n[where_contr_ch] where_contr_res = np.where( np.array(treatment_n) == 'Reservoir Control') xs_n_contr_res = xs_n[where_contr_res] # check that corresponding Controls are selected if (len(where_contr_ch[0]) != 1 or len(where_contr_res[0]) != 1): msg = "Controls for channel and reservoir must be given" \ +" exactly once (repetition {})!".format(n) raise ValueError(msg) # Apply the Bootstraping algorithm to Controls y = np.array(xs_n_contr_ch)[0] yR = np.array(xs_n_contr_res)[0] [Median, MedianR] = diffdef(y, yR) Median_DiffDef.append(Median - MedianR) # TimeUnit is a number for the day or the number of the repeat TimeUnit.extend(np.array(n).repeat(len(Median))) Treatment.extend(np.array(['Control']).repeat(len(Median))) for n in np.unique(timeunit_treat_ch): where_time = np.where(np.array(timeunit) == n) xs_n = np.array(xs)[where_time] treatment_n = np.array(treatment)[where_time] xs_n_contr_res = xs_n[where_contr_res] where_treat_ch = np.where(np.array(treatment_n) == 'Treatment') xs_n_treat_ch = xs_n[where_treat_ch] where_treat_res = np.where( np.array(treatment_n) == 'Reservoir Treatment') xs_n_treat_res = xs_n[where_treat_res] # check that corresponding Treatments are selected if (len(where_treat_ch[0]) != 1 or len(where_treat_res[0]) != 1): msg = "Treatments for channel and reservoir must be given" \ +" exactly once (repetition {})!".format(n) raise ValueError(msg) # Apply the Bootstraping algorithm to Treatments y = np.array(xs_n_treat_ch)[0] yR = np.array(xs_n_treat_res)[0] [Median, MedianR] = diffdef(y, yR) Median_DiffDef.append(Median - MedianR) # TimeUnit is a number for the day or the number of the repeat TimeUnit.extend(np.array(n).repeat(len(Median))) Treatment.extend(np.array(['Treatment']).repeat(len(Median))) # Concat all elements in the lists xs = np.concatenate(Median_DiffDef) xs = np.array(xs).ravel() treatment = np.array(Treatment) timeunit = np.array(TimeUnit) else: # If there is no 'Reservoir Channel' selected don't apply bootstrapping if model == 'glmm': Head_string = "GENERALIZED LINEAR MIXED MODEL: \n" +\ "---Results are in log space (loglink was used)--- \n" if model == 'lmm': Head_string = "LINEAR MIXED MODEL: \n" for i in range(len(xs)): # Expand every unit in treatment and timeunit to the same length as the # xs[i] they are supposed to describe # Using the "repeat" function also characters can be handled treatment[i] = np.array([treatment[i]]).repeat(len(xs[i]), axis=0) timeunit[i] = np.array([timeunit[i]]).repeat(len(xs[i]), axis=0) # Concat all elements in the lists xs = np.concatenate(xs) treatment = np.concatenate(treatment) timeunit = np.concatenate(timeunit) # Open a pyper instance r1 = pyper.R(RCMD=RCMD) # try to fix unicode decode errors by forcing english r1('Sys.setenv(LANG = "en")') r1.assign("xs", xs) # Transfer the vectors to R r1.assign("treatment", treatment) r1.assign("timeunit", timeunit) # Create a dataframe which contains all the data r1("RTDC=data.frame(xs,treatment,timeunit)") # Load the necessary library for Linear Mixed Models lme4resp = r1("library(lme4)")#.decode("utf-8") if lme4resp.count("Error"): # Tell the user that something went wrong raise OSError("R installation at {}: {}\n".format(RCMD, lme4resp) + """Please install 'lme4' via: {} -e "install.packages('lme4', repos='http://cran.r-project.org') """.format(RCMD) ) # Random intercept and random slope model if model == 'glmm': r1("Model = glmer(" + modelfunc + ",RTDC,family=Gamma(link='log'))") r1("NullModel = glmer(" + nullmodelfunc + ",RTDC,family=Gamma(link='log'))") if model == 'lmm': r1("Model = lmer(" + modelfunc + ",RTDC)") r1("NullModel = lmer(" + nullmodelfunc + ",RTDC)") r1("Anova = anova(Model,NullModel)") # Model_string = r1("summary(Model)").decode("utf-8").split("\n", 1)[1] # Anova_string = r1("Anova").decode("utf-8").split("\n", 1)[1] # Coef_string = r1("coef(Model)").decode("utf-8").split("\n", 2)[2] Model_string = r1("summary(Model)").split("\n", 1)[1] Anova_string = r1("Anova").split("\n", 1)[1] Coef_string = r1("coef(Model)").split("\n", 2)[2] # Cleanup output Coef_string = Coef_string.replace('attr(,"class")\n', '') Coef_string = Coef_string.replace('[1] "coef.mer"\n', '') #"anova" from R does a likelihood ratio test which gives a p-Value p = np.array(r1.get("Anova$Pr[2]")) # Obtain p-Value using a normal approximation # Extract coefficients r1("coefs <- data.frame(coef(summary(Model)))") r1("coefs$p.normal=2*(1-pnorm(abs(coefs$t.value)))") # Convert to array, depending on platform or R version, this is a DataFrame # or a numpy array, so we convert it to an array. Because on Windows the # result is an array with subarrays of type np.void, we must access the # elements with Coeffs[0][0] instead of Coeffs[0,0]. Coeffs = np.array(r1.get("coefs")) # The Average value of treatment 1 Estimate = Coeffs[0][0] # The Std Error of the average value of treatment 1 StdErrorEstimate = Coeffs[0][1] # treatment 2 leads to a change of the Estimate by the value "FixedEffect" FixedEffect = Coeffs[1][0] StdErrorFixEffect = Coeffs[1][1] # Before getting effect and error for y, transform back (there happened a log transformation in the glmer) estim_y = np.exp(Estimate) #estim_y_error = abs(np.exp(Estimate+StdErrorEstimate)-np.exp(Estimate-StdErrorEstimate)) fixef_y = np.exp(Estimate + FixedEffect) - np.exp(Estimate) #fixef_y_error = abs(np.exp(Estimate+StdErrorFixEffect)-np.exp(Estimate-StdErrorFixEffect)) full_summary = Head_string + Model_string +\ "\nCOEFFICIENT TABLE:\n" + Coef_string +\ "\nLIKELIHOOD RATIO TEST (MODEL VS. NULLMODEL): \n" +\ Anova_string if model == "glmm": full_summary += "\nESTIMATE AND EFFECT TRANSFORMED BACK FROM LOGSPACE" +\ "\nEstimate = \t" + str(estim_y) +\ "\nFixed effect = \t" + str(fixef_y) results = {"Full Summary": full_summary, "p-Value (Likelihood Ratio Test)": p, "Estimate": Estimate, "Std. Error (Estimate)": StdErrorEstimate, "Fixed Effect": FixedEffect, "Std. Error (Fixed Effect)": StdErrorFixEffect} return results