Commit 4c7ee89f authored by aditya.bhagwat's avatar aditya.bhagwat
Browse files

Make doench2016() robust for larger datasets (split-apply-combine)

parent 18313f16
Pipeline #126751 passed with stages
in 37 seconds
......@@ -66,12 +66,19 @@ doench2014 <- function(
magrittr::extract(, 1)
}
# Note: mclapply vs. bplapply
# I first used BiocParallel::bplapply().
# That works great on linux, but it fails on windows.
# support.bioconductor.org/p/92587/ explains likely why
# Windows doensn't allow forking (where a child process inherits parent env)
# Instead bplapply defaults to multithreading.
# In that scenario, the reticulation env is not correctly passed.
# mclapply seems like best choice: fork on linux - execute serially on win
doench2016 <- function(
contextseqs,
verbose = TRUE
chunksize = 10000,
verbose = TRUE
){
# Assert
is_identical_to_true(reticulate::py_module_available('azimuth'))
assert_is_character(contextseqs)
......@@ -84,16 +91,20 @@ doench2016 <- function(
# Score
azi <- reticulate::import('azimuth.model_comparison', delay_load = TRUE)
azi$predict(
reticulate::np_array(contextseqs),
aa_cut = NULL,
percent_peptide = NULL,
model = NULL,
model_file = NULL,
pam_audit = TRUE,
length_audit = TRUE,
learn_options_override = NULL)
}
nchunks <- ceiling(length(contextseqs) / chunksize)
contextchunks <- split(contextseqs, ceiling(seq_along(contextseqs)/chunksize))
cmessage('\t\tRun Doench2016 %d times on %d-seq chunks and concatenate (to preserve memory)', length(contextchunks), chunksize)
unlist(mclapply(contextchunks,
function(x){
azi$predict( reticulate::np_array(x),
aa_cut = NULL,
percent_peptide = NULL,
model = NULL,
model_file = NULL,
pam_audit = TRUE,
length_audit = TRUE,
learn_options_override = NULL)}))
}
#' Add efficiency scores and filter
......@@ -107,7 +118,7 @@ doench2016 <- function(
#' @param bsgenome \code{\link[BSgenome]{BSgenome-class}}
#' @param method 'Doench2014' (default) or 'Doench2016'
#' (requires non-NULL argument python, virtualenv, or condaenv)
#' @param cutoff number: cutoff value
#' @param chunksize Doench2016 is executed in chunks of chunksize
#' @param verbose TRUE (default) or FALSE
#' @param plot TRUE (default) or FALSE
#' @param alpha_var NULL or string: var mapped to alpha in plot
......@@ -170,6 +181,7 @@ doench2016 <- function(
#' @export
add_efficiency <- function(
spacers, bsgenome, method= c('Doench2014', 'Doench2016')[1],
chunksize = 10000,
verbose = TRUE, plot = TRUE,
alpha_var = default_alpha_var(spacers)
){
......@@ -189,7 +201,8 @@ add_efficiency <- function(
scores <- switch(
method,
Doench2014 = doench2014(scoredt$crisprcontext, verbose=verbose),
Doench2016 = doench2016(scoredt$crisprcontext, verbose=verbose))
Doench2016 = doench2016(scoredt$crisprcontext, chunksize=chunksize,
verbose=verbose))
scoredt[ , (method) := scores ]
# Merge back in
......
# Load functions
require(reticulate)
use_condaenv('azienv')
import('azimuth')
require(AnnotationDbi)
require(BSgenome)
require(multicrispr)
......@@ -10,7 +13,7 @@ require(stringi)
# Load Brunello library
brunello_url <- 'https://www.addgene.org/static/cms/filer_public/8b/4c/8b4c89d9-eac1-44b2-bb2f-8fea95672705/broadgpp-brunello-library-contents.txt'
brunello_file <- '../multicrisprout/brunello/brunello.txt'
download.file(brunello_url, brunello_file)
#download.file(brunello_url, brunello_file)
brunello <- data.table::fread(brunello_file)
brunello %<>% extract(!is.na(`Target Gene ID`)) # rm pos controls
brunello[, .(N = length(unique(`Target Transcript`))), by = 'Target Gene ID'][, table(N)] # 1 transcr per gene
......@@ -75,6 +78,14 @@ spacers <- br %>% extend(0, +3) %>% multicrispr::find_spacers(bsgenome, plot = F
setdiff(brcoords, spcoords) %>% length()
setdiff(spcoords, brcoords) %>% length()
autonomics.plot::plot_venn(list(multicrispr = spcoords, brunello = brcoords))
brunello_spacers <- spacers[spcoords %in% brcoords]
names(mcols(brunello_spacers)) %<>% stringi::stri_replace_first_fixed('Doench2016', 'BrunelloDoench2016')
brunello_spacers[1:10] %>% add_efficiency(bsgenome, 'Doench2016', plot = FALSE)
brunello_spacers %<>% add_efficiency(bsgenome, 'Doench2016', plot = FALSE)
plot(brunello_spacers$BrunelloDoench2016, brunello_spacers$Doench2016)
plot(density(brunello_spacers$Doench2016))
# Find overlapping exons
ensdb <- multicrispr::EnsDb.Hsapiens.v98()
......@@ -112,10 +123,34 @@ autonomics.plot::plot_venn(list(brunello = brunellocoords, multicrispr = exonspa
exonspacers$in_brunello <- exonspacercoords %in% brunellocoords
exonspacers %>% subset(in_brunello==TRUE)
reticulate::use_condaenv('azienv'); #reticulate::use_condaenv('r3.6_python2.7')
reticulate::import('azimuth')
exonspacers %<>% add_context(bsgenome)
exonspacers %<>% add_efficiency(bsgenome, 'Doench2016')
contextseqs <- exonspacers$crisprcontext[1:1000]
score_doench2016_parallel <- function(contextseqs, chunksize = 100){
nchunks <- ceiling(length(contextseqs) / chunksize)
scores<-BiocParallel::bplapply(
seq_len(nchunks),
function(i){
chunkstart <- (i-1)*chunksize + 1
chunkend <- min(i*chunksize, length(contextseqs))
multicrispr:::doench2016(contextseqs[chunkstart:chunkend])
}) %>%
unlist()
}
parallel::mclapply()
exonspacers$Doench2016 <- 0
exonspacers$Doench2016[ 1:100000] <- multicrispr:::doench2016(exonspacers$crisprcontext[ 1:100000])
exonspacers$Doench2016[100001:200000] <- multicrispr:::doench2016(exonspacers$crisprcontext[100001:200000])
tmp <- exonspacers[1:100000]
tmp %<>% add_efficiency(bsgenome, 'Doench2016')
# add_genome_counts
bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
......
......@@ -9,6 +9,7 @@ add_efficiency(
spacers,
bsgenome,
method = c("Doench2014", "Doench2016")[1],
chunksize = 10000,
verbose = TRUE,
plot = TRUE,
alpha_var = default_alpha_var(spacers)
......@@ -32,13 +33,13 @@ filter_efficient(
\item{method}{'Doench2014' (default) or 'Doench2016'
(requires non-NULL argument python, virtualenv, or condaenv)}
\item{chunksize}{Doench2016 is executed in chunks of chunksize}
\item{verbose}{TRUE (default) or FALSE}
\item{plot}{TRUE (default) or FALSE}
\item{alpha_var}{NULL or string: var mapped to alpha in plot}
\item{cutoff}{number: cutoff value}
}
\value{
numeric vector
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment