Commit 294b8f53 authored by daniel.eilertz's avatar daniel.eilertz
Browse files

Merge branch 'eilertz' into 'main'

Added functions for peak picking, isotope ratio calculation | First draft of function to handle mass/abundance of isotope adducts

See merge request !4
parents 00fe9ec2 6a2fa043
FROM python:3.9
RUN apt-get update
#CV 2 dependencies for findpeaks package
RUN apt-get install ffmpeg libsm6 libxext6 -y
RUN apt-get install nano
ADD . /code/
WORKDIR /code
# set env to use autoMDVhr as package
ENV PYTHONPATH "/code/autoMDVhr"
# Get pip and install packages & wheels
RUN curl https://bootstrap.pypa.io/get-pip.py -o setup/get-pip.py
RUN python3 setup/get-pip.py
#RUN pip install --upgrade pip
RUN pip3 install --upgrade pip
RUN pip3 install matplotlib
RUN pip3 install matplotlib==3.5.2
RUN pip3 install numpy==1.23.1
......@@ -21,5 +24,8 @@ RUN pip3 install pandas==1.4.3
RUN pip3 install plotly==5.9.0
RUN pip3 install dill
RUN pip3 install tqdm
RUN pip3 install opencv-python
RUN pip3 install findpeaks
RUN pip3 install seaborn
RUN pip3 install setup/pyopenms-2.7.0-cp39-cp39-manylinux2014_x86_64.whl
from pyopenms import *
from pyopenms.Constants import *
import numpy as np
import pandas as pd
def calc_iso_pattern(formula, adduct, verbose=False, percentages=True):
'''
Description
-----------
TODO: Build data structure to handle chemical stuff for metabolites
TODO: Update analyste_list.xlsx or parse current adduct info
TODO: Handle Electron Mass when proton is added/removed from formula
TODO: Handle positive/negative charge
Parameters
----------
Returns
-------
'''
#Glutamine (C5H9N2O3)
#Glutamine (C5H10N2O3)
#formula = "C5H10N2O3"
#adduct = "H-1"
#charge="-"
#Glutamine (C5H9N2O3)
#impact 2 Bruker
# 4 Kommastellen
#145.06185256
# C13C12_MASSDIFF_U # C13 mass diff
emp_form = EmpiricalFormula(formula)#,charge=-1
add_form = EmpiricalFormula(adduct)
emp_form = emp_form + add_form
iso_dist = {"isotopolopgue": [], "abundance": []}
# print("Coarse Isotope Distribution:")
isotopes = emp_form.getIsotopeDistribution( CoarseIsotopePatternGenerator(6) )
prob_sum = sum([iso.getIntensity() for iso in isotopes.getContainer()])
if verbose:
print("This covers", prob_sum, "probability")
for iso in isotopes.getContainer():
#!!!!
#!!!!
#a.getMZ() + ELECTRON_MASS_U
iso_dist["isotopolopgue"].append(iso.getMZ())
iso_dist["abundance"].append((iso.getIntensity() * 100))
if verbose:
print("Isotope", iso.getMZ(), "has abundance", iso.getIntensity()*100, "%")
if percentages:
iso_dist = np.array(list(iso_dist.values())[1])/(list(iso_dist.values())[1][0]/100)
return iso_dist
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
def compare_iso_ratio(uniIntens,test_abu,plot=True):
'''
Distance bewtween isopope pattern vectors for given intensity arrays and and a reference ratio of isopopes
Description
-----------
compare_iso_ratio calculates the distance between given isopope intensity ratios and
a given reference ratio. The isoptope intensity ratios are calculated based on isotopologue
intensity arrays. The reference ratio must be a numpy array with the length of the
respective number of isoptope patterns for the given metabolite.
TODO Envipat
Parameters
----------
uniIntens: 3 dimensional array holding with intensities for each isotope.
test_abu: 1 dimensional array with reference isotope abundances [%].
Returns
-------
ratio_distance: 2 dimensional array with distances between each isotope ratio and the reference.
Examples
--------
>>> # Example of ratio calculation
>>> x = np.array([[10,6,4],[8,6,2],[12,6,0]])
>>> y = np.array([[5,4,2],[4,2,4],[2,3,2]])
>>> z = np.array([[0,2,2],[8,2,2],[2,2,2]])
>>> a = np.array([x,y,z])
>>> b = [100,50,20]
>>> #Calculate percentages based on h[0] ref
>>> h = a / (a[0]/100)
>>> # Calculate differnce between ratio vector and atacked arrays
>>> t = (h.transpose() - b).transpose()
>>> # Calculate Vector distance
>>> r = np.sqrt(np.sum(np.square(t),axis=0))
'''
# #Calculate percentages based on h[0] ref
ratio_uniIntens = uniIntens / (uniIntens[0]/100)
# Calculate difference between isotope ratios and theoretical ratio
ratio_diff = (ratio_uniIntens.transpose() - test_abu).transpose()
#Calucalte vector distance of differences
ratio_vector = np.sqrt(np.sum(np.square(ratio_diff),axis=0))
#ratio_vector[ratio_vector >= 1] = numpy.nan
# Plot log image with spectral colorbar
if plot:
plt.imshow(ratio_vector,
cmap='Spectral',
origin = 'lower',
aspect = 'auto',
norm=LogNorm(
vmin=np.nanmin(ratio_vector),
vmax=np.nanmax(ratio_vector)
)
)
plt.colorbar()
return ratio_vector
\ No newline at end of file
import numpy as np
import pandas as pd
def compare_iso_ratio_peaks(peaks_pers,uniIntens,test_abu):
'''
Distance bewtween isopope pattern vectors for given peaks and and a reference ratio of isopopes
Description
-----------
compare_iso_ratio_peaks calculates the distance between given isopope intensity ratios and
a given reference ratio. The isoptope intensity ratios are calculated based on peaks, which have been
detected with the findpeaks module. The reference ratio must be a numpy array with the length of
the respective number of isoptope patterns for the given metabolite.
TODO Envipat
Parameters
----------
peaks_pers: pandas dataframe with findpeaks' detected peaks.
uniIntens: 3 dimensional array holding the intensities for each isotope.
test_abu: 1 dimensional array with reference isotope abundances [%].
Returns
-------
ratio_distance_peaks: 1 dimensional array with distances between each isotope ratio and the reference.
'''
# Set rel isotope abundances for glutamine
#test_abu = np.array([100,5.407864,0.11698,0.001265,0.000007,0.000001]) # glutamine based on envipat
# Get all peaks' xy
peaks_xy_full = np.array(peaks_pers["persistence"][["x","y"]])
peak_intens = np.zeros((len(peaks_xy_full),np.shape(uniIntens)[0]))
ratio_distance_peaks = np.zeros(len(peaks_xy_full))
# Caluculate vectoe difference for each peak
for i in range(0,len(peaks_xy_full)):
# Create mask for peak
mask = np.full(np.shape(uniIntens[0]),False)
mask[peaks_xy_full[i,0],peaks_xy_full[i,1]] = True
# Get intensities for stacked peaks' xy
peak_intens[i,:] = np.array([uniIntens[x][mask] for x in range(0,np.shape(uniIntens)[0],1)]).flatten()
# Normalize abundances to M+0 as 100 percent
rel_abu_vector = peak_intens[i,:]/(peak_intens[i,:][0]/100)
# Calculate vector distance with dummies
ratio_distance_peaks[i] = np.sqrt(np.sum(np.square(test_abu-rel_abu_vector)))
return ratio_distance_peaks
\ No newline at end of file
import matplotlib.pyplot as plt
import numpy as np
from findpeaks import findpeaks
import seaborn as sns
def find_peaks_persistence(array, stack=True, normalize=True, plot=True):
'''
Description
-----------
Parameters
----------
Returns
-------
'''
# array = uniIntens[0]
# stack = True
# normalize = True
# plot = True
if stack:
# Stack arrays of isotopologues
uniIntensStack = np.stack(array,axis=0)
# Calculate stats for
uniIntensStackSum = np.sum(uniIntensStack,axis=0).T
else:
uniIntensStackSum = array.T
# Normlize raw intensities
if normalize:
image = ((uniIntensStackSum - np.min(uniIntensStackSum)) / (np.max(uniIntensStackSum) - np.min(uniIntensStackSum))) * 100
else:
image = uniIntensStackSum
#plot distribution of intensities
# plt.hist(image.flatten(), color = 'blue', bins = 'auto')
# plt.show()
#boxplot of intensities
# sns.boxplot(image.flatten())
# percentiles of intensities
#perc = np.percentile(image.flatten(),np.array(range(10,100,1)))
# Sort Intensities
sort_img = np.sort(image.flatten())
# Calc slope of sorted intensities
diff_sort_img = np.diff(sort_img)
# Get slope threshold index
lower_slope_thresh = np.min(np.where(diff_sort_img > 0.5))
lower_slope_thresh_2 = np.min(np.where(diff_sort_img > 1))
lower_slope_thresh_3 = np.min(np.where(diff_sort_img > 2))
# get intensity threshold index (not needed)
lower_thresh = np.where(image == sort_img[lower_slope_thresh])
min_limit = image[lower_thresh[0],lower_thresh[1]]
# Initialize findpeaks class
# fp = findpeaks(method='topology', scale=True,limit = min_limit, denoise='fastnl', window=3, togray=True, imsize=(50,150),verbose=5)
# fp = findpeaks(method='topology', scale=True, limit = min_limit, denoise='fastnl', window=3, togray=True,verbose=5)
fp = findpeaks(method='topology', scale=True, limit = min_limit, denoise='fastnl', window=3, togray=True,verbose=5)
# fp = findpeaks(method='topology', scale=True, window=3, togray=True,verbose=5)
# set data to analyze
#X = fp.import_example("2dpeaks")
X= image
# Detect peaks
results = fp.fit(X)
#get real peaks
#peaks_x_y = results['persistence'][["x","y"]].loc[results['persistence']['peak']==True]
if plot:
# plot sorted intensities
# plt.plot(range(0,np.shape(sort_img)[0],1),sort_img)
# plt.show()
#np.where(sort_img > perc[80])
# plot derivates of sorted intensities
# plt.plot(range(0,np.shape(diff_sort_img)[0],1),diff_sort_img)
# plt.show()
# plot specific range of derivates of sorted intensities with preset thersholds
range_min = np.max([0,lower_slope_thresh - 200])
plt.plot(range(range_min,np.shape(diff_sort_img)[0],1),diff_sort_img[range_min:])
plt.axhline(y = 0.5, color = 'b', linestyle = '-')
plt.annotate('∆: 0.5', xy=(range_min, 0.5), fontsize=6,color='b') #xytext=(4, 10),
plt.axhline(y = 1, color = 'g', linestyle = '-')
plt.annotate('∆: 1', xy=(range_min, 1), fontsize=6,color='g') #xytext=(4, 10),
plt.axhline(y = 2, color = 'r', linestyle = '-')
plt.annotate('∆: 2', xy=(range_min, 2), fontsize=6,color='r') #xytext=(4, 10),
plt.show()
# plot sorted intensities with treshold
max_y = np.max(sort_img)
plt.plot(range(range_min,np.shape(sort_img)[0],1),sort_img[range_min:])
plt.axvline(x=lower_slope_thresh, color='blue',linestyle='-')
plt.annotate('∆: 0.5', xy=(lower_slope_thresh, max_y), fontsize=6,color='b')
plt.axvline(x=lower_slope_thresh_2, color='green',linestyle='-')
plt.annotate('∆: 1', xy=(lower_slope_thresh_2, max_y), fontsize=6,color='g')
plt.axvline(x=lower_slope_thresh_3, color='red',linestyle='-')
plt.annotate('∆: 2', xy=(lower_slope_thresh_3, max_y), fontsize=6,color='r')
plt.show()
# findpeaks preprocessing plots
# NOTE Preprocessing plots are not shown, most likely due to error with cv2
# fp.plot_preprocessing()
# findpeaks result plots
# NOTE plots cannot be saved to file
fp.plot()
fp.plot_persistence()
fp.plot_mesh()
fp.plot_mesh(view=(90,0))
return results
\ No newline at end of file
......@@ -6,9 +6,9 @@ from autoMDVhr.find_peaks import *
def plot_peaks(uniIntens):
# Stack arrays of isotopologues and sum Intensities
# Stack arrays of isotopologues and sum Intensities (transpose summed array)
uniIntensStack = np.stack(uniIntens,axis=0)
uniIntensStackSum = np.sum(uniIntensStack,axis=0)
uniIntensStackSum = np.sum(uniIntensStack,axis=0).T
# get xy dimensions
x_dim = np.shape(uniIntensStackSum)[1]
......@@ -21,7 +21,7 @@ def plot_peaks(uniIntens):
XX,YY=np.meshgrid(xx,yy)
# Transform data to speed up peak picking
step = 0.02
step = 0.01
zmax = uniIntensStackSum.max() / uniIntensStackSum.max()
data = uniIntensStackSum / uniIntensStackSum.max()
......@@ -96,4 +96,7 @@ def plot_peaks(uniIntens):
plt.show(block=False)
plt.savefig("peak_overview.png")
\ No newline at end of file
plt.savefig("peak_overview.png")
plt.close()
return peaks
\ No newline at end of file
......@@ -11,16 +11,16 @@ def plot_stacked_isotopologues(uniIntens):
uniIntensStack = np.stack(uniIntens,axis=0)
# Calculate stats for
uniIntensStackSum = np.sum(uniIntensStack,axis=0)
uniIntensStackMean = np.mean(uniIntensStack,axis=0)
uniIntensStackSd = np.std(uniIntensStack,axis=0)
uniIntensStackSum = np.sum(uniIntensStack,axis=0).T
uniIntensStackMean = np.mean(uniIntensStack,axis=0).T
uniIntensStackSd = np.std(uniIntensStack,axis=0).T
uniIntensStackVarCof = uniIntensStackSd / uniIntensStackMean
# Set up dict for data and text annotations
data = {
"sum" : uniIntensStackSum,
"log(sum)" : np.log(uniIntensStackSum),
"log(sum)" : np.log(np.log(uniIntensStackSum)),
"square(sum)" : np.square(uniIntensStackSum),
"mean(sum)" : uniIntensStackMean,
"sd(sum)" : uniIntensStackSd,
......@@ -41,11 +41,12 @@ def plot_stacked_isotopologues(uniIntens):
fig.suptitle('Summed intensities for aggregated isotopologues', fontsize=15)
ax.set_xlabel('RT units', fontsize=10)
ax.set_ylabel('Stacked MZ units', fontsize=10)
ax.set_xticks(range(9,np.shape(list(data.values())[ida])[1],10))
ax.set_xticklabels(range(10,np.shape(list(data.values())[ida])[1]+10,10))
#ax.set_xticks(range(9,np.shape(list(data.values())[ida])[1],10))
#ax.set_xticklabels(range(10,np.shape(list(data.values())[ida])[1]+10,10))
ax.text(33,50,list(data)[ida],color="white",weight="bold")
plt.savefig("Stacked_isotopologues_intensities.png")
plt.close()
plt.show()
# plt.savefig("Stacked_isotopologues_intensities_1.png")
# plt.close()
......
......@@ -10,16 +10,23 @@ import pathlib
import os
import sqlite3, dill
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
from datetime import datetime
import pandas as pd
import argparse
from pyopenms import *
from pyopenms.Constants import *
from pyparsing import col
from autoMDVhr import *
from autoMDVhr.find_peaks import *
from autoMDVhr.plot_stacked_isotopologues import *
from autoMDVhr.plot_isotopologues import *
from autoMDVhr.plot_peaks import *
from autoMDVhr.find_peaks_persistence import *
from autoMDVhr.compare_iso_ratio_peaks import *
from autoMDVhr.compare_iso_ratio import *
from autoMDVhr.calc_iso_pattern import *
import pickle as pk
# Parse arguments
......@@ -28,7 +35,7 @@ parser.add_argument("--folder", help="Folder with .D files")
parser.add_argument("--file", help="Folder with .D file")
parser.add_argument("--chrom", help="Chromatography options: 'polar', 'lipid', 'soso', 'polar20', 'polar21' ")
parser.add_argument("--specType", help="Spectrum type: 'profile', 'line' ")
args=parser.parse_args()
args=parser.parse_args("")
# TODO: Handling of folder with dfiles
# args.file = "/code/data/00067856A549_12C_4_P2b1_1_1117.d"
# args.file = "/code/data/00067858A549_13C_d4_P2b3_1_1119.d"
......@@ -103,9 +110,15 @@ unimz = np.round(np.arange(glob_var["mzrange"][0],
# rtStart = 90
# rtEnd = 130
# # RT an mass list for Glutamine (C5H10N2O3). Should come from analyte list later on
<<<<<<< autoMDVhr/run_autoMDVhr.py
isotopologues = 145.0618654 + np.arange(-1,6) * glob_var["add13c"]
rtStart = 600
rtEnd = 630
=======
# isotopologues = 145.0618654 + np.arange(-1,5) * glob_var["add13c"]
# rtStart = 600
# rtEnd = 630
>>>>>>> autoMDVhr/run_autoMDVhr.py
# # RT an mass list for LPE 16:0 (C21H44NO7P). Should come from analyte list later on
# isotopologues = 454.292816 + np.arange(-1,21) * glob_var["add13c"]
......@@ -157,8 +170,26 @@ start_time = datetime.now()
plot_isotopologues(uniMzList, rt_real, uniIntens, isotopologues, filename = "test.pdf")
print("plot_isotopologues: ", str(datetime.now() - start_time))
# get peaks by contour method
peaks = plot_peaks(uniIntens)
plot_peaks(uniIntens)
# plot summed 2D aggregated data for all isotopologues
plot_stacked_isotopologues(uniIntens)
# Find peaks based on persistence
peaks_pers = find_peaks_persistence(uniIntens)
# Get all peaks' xy
peaks_xy_full = np.array(peaks_pers["persistence"][["x","y"]])
# Set rel isotope abundances for glutamine
test_abu1= np.array([100,5.407864,0.11698,0.001265,0.000007,0.000001]) # glutamine based on envipat
test_abu = calc_iso_pattern("C5H10N2O3","H-1", charge="-", verbose=True, percentages=True)
compare_iso_ratio_peaks(peaks_pers,uniIntens[1:],test_abu1)
compare_iso_ratio(uniIntens[1:],test_abu1,plot=True)
bk = {}
for k in ['uniMzList', 'uniIntens', 'rt_real', 'isotopologues']: # alternatively, dir() for saving everything
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment