From f837a1535d34bba9ff612b5e3d140afe84dac0df Mon Sep 17 00:00:00 2001 From: renewiegandt <rene.wiegandt@mpi-bn.mpg.de> Date: Tue, 9 Nov 2021 13:23:50 +0100 Subject: [PATCH] Fix for #1 --- R/05_manipulate_ranges.R | 8 + R/06_find_spacers.R | 1 + R/08_score_ontargets.R | 469 ++++++++++++++++++++------------------- 3 files changed, 244 insertions(+), 234 deletions(-) diff --git a/R/05_manipulate_ranges.R b/R/05_manipulate_ranges.R index f6c4f02..fc158b0 100644 --- a/R/05_manipulate_ranges.R +++ b/R/05_manipulate_ranges.R @@ -398,4 +398,12 @@ double_flank <- function( newgr } +#' Filter N +#' +#' @param spacers \code{\link[GenomicRanges]{GRanges-class}} +#' @return \code{\link[GenomicRanges]{GRanges-class}} +filter_N <- function(spacers, col="crisprspacer"){ + spacers <- spacers[!grepl("N",elementMetadata(spacers)[,col])] + spacers[!grepl("N",elementMetadata(spacers)[,"crisprpam"])] +} diff --git a/R/06_find_spacers.R b/R/06_find_spacers.R index 879255c..2217eee 100644 --- a/R/06_find_spacers.R +++ b/R/06_find_spacers.R @@ -258,6 +258,7 @@ find_spacers <- function(gr, bsgenome, spacer = strrep('N', 20), pam = 'NGG', spacers$crisprspacer <- getSeq(bsgenome, spacers, as.character=TRUE) spacers$crisprpam <- getSeq(bsgenome, pams, as.character=TRUE) if (verbose) message('\tFound ', length(spacers), ' spacers') + spacers %<>% filter_N() # Add on/offtargets spacers %<>% count_offtargets( bsgenome, targets = if (subtract_targets) gr else NULL, diff --git a/R/08_score_ontargets.R b/R/08_score_ontargets.R index babe95f..6b439bb 100644 --- a/R/08_score_ontargets.R +++ b/R/08_score_ontargets.R @@ -1,235 +1,236 @@ - -#' Add [-4, +3] context -#' -#' Add context for Doench2016 scoring -#' -#' @param spacers \code{\link[GenomicRanges]{GRanges-class}}: spacer ranges -#' @param bsgenome \code{\link[BSgenome]{BSgenome-class}} -#' @param verbose logical(1) -#' @return character vector -#' @examples -#' # PE example -#' #----------- -#' require(magrittr) -#' bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 -#' gr <- char_to_granges(c(PRNP = 'chr20:4699600:+', # snp -#' HBB = 'chr11:5227002:-', # snp -#' HEXA = 'chr15:72346580-72346583:-', # del -#' CFTR = 'chr7:117559593-117559595:+'), # ins -#' bsgenome) -#' spacers <- find_spacers(extend_for_pe(gr), bsgenome, complement=FALSE) -#' (add_context(spacers, bsgenome)) -#' -#' # TFBS example -#' #------------- -#' bedfile <- system.file('extdata/SRF.bed', package = 'multicrispr') -#' bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 -#' targets <- extend(bed_to_granges(bedfile, 'mm10')) -#' spacers <- find_spacers(targets, bsgenome) -#' (spacers %<>% add_context(bsgenome)) -#' @noRd -add_context <- function(spacers, bsgenome, verbose = TRUE){ - - # Prevent from stats::start from being used (leads to bug!) - contexts <- extend(spacers, -4, +6) - spacers$crisprcontext <- BSgenome::getSeq( - bsgenome, contexts, as.character = TRUE) - if (verbose) message('\t\tAdd (4-23-3) contextseqs') - spacers -} - - -doench2014 <- function( - contextseqs, - python = NULL, - virtualenv = NULL, - condaenv = NULL, - verbose -){ - - # Assert - assert_is_character(contextseqs) - assert_all_are_true(nchar(contextseqs)==30) - assert_is_a_bool(verbose) - - # Message - if (verbose) message('\t\tScore contextseqs with Doench2014') - - # Read featureWeightMatrix - fwfile <- system.file("extdata/DoenchNBT2014.csv", package = "CRISPRseek") - fwmatrix <- read.csv(fwfile, header = TRUE) - - # Score and return - CRISPRseek::calculategRNAEfficiency(contextseqs, - baseBeforegRNA = 4, - featureWeightMatrix = fwmatrix) %>% - magrittr::extract(, 1) -} - -# Note: mclapply vs. bplapply -# I first used BiocParallel::bplapply(). -# That works great on linux, but it fails on windows. -# support.bioconductor.org/p/92587/ explains likely why -# Windows doensn't allow forking (where a child process inherits parent env) -# Instead bplapply defaults to multithreading. -# In that scenario, the reticulation env is not correctly passed. -# mclapply seems like best choice: fork on linux - execute serially on win -doench2016 <- function( - contextseqs, - chunksize = 10000, - verbose = TRUE -){ - # Assert - is_identical_to_true(reticulate::py_module_available('azimuth')) - assert_is_character(contextseqs) - assert_all_are_true(nchar(contextseqs)==30) - assert_is_a_bool(verbose) - - # Message - if (verbose) message('\t\tScore contextseqs with Doench2016 (azimuth)') - start_time <- Sys.time() - - # Score - azi <- reticulate::import('azimuth.model_comparison', delay_load = TRUE) - nchunks <- ceiling(length(contextseqs) / chunksize) - contextchunks <- split( - contextseqs, ceiling(seq_along(contextseqs)/chunksize)) - txt <- paste0('\t\tRun Doench2016 %d times on %d-seq chunks ', - 'to preserve memory') - cmessage(txt, length(contextchunks), chunksize) - mc.cores <- if (is_windows()) 1 else max(1, parallel::detectCores()-2) - doench2016scores <- unlist(parallel::mclapply(contextchunks, - function(x){ - reticulate::py_suppress_warnings( - azi$predict( reticulate::np_array(x), - aa_cut = NULL, - percent_peptide = NULL, - model = NULL, - model_file = NULL, - pam_audit = TRUE, - length_audit = TRUE, - learn_options_override = NULL))}, - mc.cores = mc.cores)) - - # Return - end_time <- Sys.time() - if (verbose) cmessage('\t\tCompleted in %s', - format(end_time - start_time, digits = 2)) - doench2016scores -} - - -#' Add on-target efficiency scores -#' -#' Add Doench2014 or Doench2016 on-target efficiency scores -#' -#' \code{add_ontargets} adds efficiency scores -#' \code{filter_ontargets} adds efficiency scores and filters on them -#' -#' @param spacers \code{\link[GenomicRanges]{GRanges-class}}: spacers -#' @param bsgenome \code{\link[BSgenome]{BSgenome-class}} -#' @param ontargetmethod 'Doench2014' (default) or 'Doench2016' -#' (requires non-NULL argument python, virtualenv, or condaenv) -#' @param chunksize Doench2016 is executed in chunks of chunksize -#' @param verbose TRUE (default) or FALSE -#' @param plot TRUE (default) or FALSE -#' @param ... passed to \code{\link{plot_intervals}} -#' @return numeric vector -#' @examples -#' # Install azimuth -#' #---------------- -#' ## With reticulate -#' # require(reticulate) -#' # conda_create('azienv', c('python=2.7')) -#' # use_condaenv('azienv') -#' # py_install(c('azimuth', 'scikit-learn==0.17.1', 'biopython=='1.76'), -#' # 'azienv', pip = TRUE) -#' -#' ## Directly -#' # conda create --name azienv python=2.7 -#' # conda activate azienv -#' # pip install scikit-learn==0.17.1 -#' # pip install biopython==1.76 -#' # pip install azimuth -#' -#' # PE example -#' #----------- -#' require(magrittr) -#' bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 -#' targets <- char_to_granges(c(PRNP = 'chr20:4699600:+', # snp -#' HBB = 'chr11:5227002:-', # snp -#' HEXA = 'chr15:72346580-72346583:-', # del -#' CFTR = 'chr7:117559593-117559595:+'), # ins -#' bsgenome) -#' spacers <- find_primespacers(targets, bsgenome, ontargetmethod=NULL, -#' offtargetmethod=NULL) -#' spacers %<>% score_ontargets(bsgenome, 'Doench2014') -#' # reticulate::use_condaenv('azienv') -#' # reticulate::import('azimuth') -#' # spacers %<>% score_ontargets(bsgenome, 'Doench2016') -#' -#' # TFBS example -#' #------------- -#' bedfile <- system.file('extdata/SRF.bed', package = 'multicrispr') -#' bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 -#' targets <- extend(bed_to_granges(bedfile, 'mm10')) -#' spacers <- find_spacers(targets, bsgenome, ontargetmethod=NULL, -#' offtargetmethod=NULL) -#' spacers %<>% score_ontargets(bsgenome, 'Doench2014') -#' # reticulate::use_condaenv('azienv') -#' # reticulate::import('azimuth') -#' # spacers %>% score_ontargets(bsgenome, 'Doench2016') -#' @references -#' Doench 2014, Rational design of highly active sgRNAs for -#' CRISPR-Cas9-mediated gene inactivation. Nature Biotechnology, -#' doi: 10.1038/nbt.3026 -#' -#' Doench 2016, Optimized sgRNA design to maximize activity and minimize -#' off-target effects of CRISPR-Cas9. Nature Biotechnology, -#' doi: 10.1038/nbt.3437 -#' -#' Python module azimuth: github/MicrosoftResearch/azimuth -#' @export -score_ontargets <- function( - spacers, bsgenome, ontargetmethod= c('Doench2014', 'Doench2016')[1], - chunksize = 10000, verbose = TRUE, plot = TRUE, ... -){ - # Assert - crisprcontext <- NULL - assert_is_all_of(spacers, 'GRanges') - if (is.null(ontargetmethod)) return(spacers) - assert_is_a_string(ontargetmethod) - assert_is_subset(ontargetmethod, c('Doench2014', 'Doench2016')) - if (ontargetmethod %in% names(mcols(spacers))){ - mcols(spacers)[[ontargetmethod]] <- NULL} - - # Add contextseq - if (verbose) cmessage('\tScore ontargets') - spacers %<>% add_context(bsgenome, verbose = verbose) - spacerdt <- gr2dt(spacers) - scoredt <- data.table(crisprcontext = unique(spacerdt$crisprcontext)) - - # Score - scores <- switch(ontargetmethod, - Doench2014 = doench2014(scoredt$crisprcontext, verbose=verbose), - Doench2016 = doench2016(scoredt$crisprcontext, chunksize=chunksize, - verbose=verbose)) - scoredt[ , (ontargetmethod) := scores ] - - # Merge back in - mergedt <- merge(spacerdt, scoredt, - by='crisprcontext', sort=FALSE, all.x=TRUE) - mergedt[, crisprcontext := NULL] - spacers <- dt2gr(mergedt, seqinfo = seqinfo(spacers)) - - # Plot - if (plot) print(plot_intervals(spacers, ...)) - - # Return - spacers -} - - -default_alpha_var <- function(gr){ - if ('off' %in% names(mcols(gr))) 'off' else NULL + +#' Add [-4, +3] context +#' +#' Add context for Doench2016 scoring +#' +#' @param spacers \code{\link[GenomicRanges]{GRanges-class}}: spacer ranges +#' @param bsgenome \code{\link[BSgenome]{BSgenome-class}} +#' @param verbose logical(1) +#' @return character vector +#' @examples +#' # PE example +#' #----------- +#' require(magrittr) +#' bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 +#' gr <- char_to_granges(c(PRNP = 'chr20:4699600:+', # snp +#' HBB = 'chr11:5227002:-', # snp +#' HEXA = 'chr15:72346580-72346583:-', # del +#' CFTR = 'chr7:117559593-117559595:+'), # ins +#' bsgenome) +#' spacers <- find_spacers(extend_for_pe(gr), bsgenome, complement=FALSE) +#' (add_context(spacers, bsgenome)) +#' +#' # TFBS example +#' #------------- +#' bedfile <- system.file('extdata/SRF.bed', package = 'multicrispr') +#' bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 +#' targets <- extend(bed_to_granges(bedfile, 'mm10')) +#' spacers <- find_spacers(targets, bsgenome) +#' (spacers %<>% add_context(bsgenome)) +#' @noRd +add_context <- function(spacers, bsgenome, verbose = TRUE){ + + # Prevent from stats::start from being used (leads to bug!) + contexts <- extend(spacers, -4, +6) + spacers$crisprcontext <- BSgenome::getSeq( + bsgenome, contexts, as.character = TRUE) + if (verbose) message('\t\tAdd (4-23-3) contextseqs') + spacers +} + + +doench2014 <- function( + contextseqs, + python = NULL, + virtualenv = NULL, + condaenv = NULL, + verbose +){ + + # Assert + assert_is_character(contextseqs) + assert_all_are_true(nchar(contextseqs)==30) + assert_is_a_bool(verbose) + + # Message + if (verbose) message('\t\tScore contextseqs with Doench2014') + + # Read featureWeightMatrix + fwfile <- system.file("extdata/DoenchNBT2014.csv", package = "CRISPRseek") + fwmatrix <- read.csv(fwfile, header = TRUE) + + # Score and return + CRISPRseek::calculategRNAEfficiency(contextseqs, + baseBeforegRNA = 4, + featureWeightMatrix = fwmatrix) %>% + magrittr::extract(, 1) +} + +# Note: mclapply vs. bplapply +# I first used BiocParallel::bplapply(). +# That works great on linux, but it fails on windows. +# support.bioconductor.org/p/92587/ explains likely why +# Windows doensn't allow forking (where a child process inherits parent env) +# Instead bplapply defaults to multithreading. +# In that scenario, the reticulation env is not correctly passed. +# mclapply seems like best choice: fork on linux - execute serially on win +doench2016 <- function( + contextseqs, + chunksize = 10000, + verbose = TRUE +){ + # Assert + is_identical_to_true(reticulate::py_module_available('azimuth')) + assert_is_character(contextseqs) + assert_all_are_true(nchar(contextseqs)==30) + assert_is_a_bool(verbose) + + # Message + if (verbose) message('\t\tScore contextseqs with Doench2016 (azimuth)') + start_time <- Sys.time() + + # Score + azi <- reticulate::import('azimuth.model_comparison', delay_load = TRUE) + nchunks <- ceiling(length(contextseqs) / chunksize) + contextchunks <- split( + contextseqs, ceiling(seq_along(contextseqs)/chunksize)) + txt <- paste0('\t\tRun Doench2016 %d times on %d-seq chunks ', + 'to preserve memory') + cmessage(txt, length(contextchunks), chunksize) + mc.cores <- if (is_windows()) 1 else max(1, parallel::detectCores()-2) + doench2016scores <- unlist(parallel::mclapply(contextchunks, + function(x){ + reticulate::py_suppress_warnings( + azi$predict( reticulate::np_array(x), + aa_cut = NULL, + percent_peptide = NULL, + model = NULL, + model_file = NULL, + pam_audit = TRUE, + length_audit = TRUE, + learn_options_override = NULL))}, + mc.cores = mc.cores)) + + # Return + end_time <- Sys.time() + if (verbose) cmessage('\t\tCompleted in %s', + format(end_time - start_time, digits = 2)) + doench2016scores +} + + +#' Add on-target efficiency scores +#' +#' Add Doench2014 or Doench2016 on-target efficiency scores +#' +#' \code{add_ontargets} adds efficiency scores +#' \code{filter_ontargets} adds efficiency scores and filters on them +#' +#' @param spacers \code{\link[GenomicRanges]{GRanges-class}}: spacers +#' @param bsgenome \code{\link[BSgenome]{BSgenome-class}} +#' @param ontargetmethod 'Doench2014' (default) or 'Doench2016' +#' (requires non-NULL argument python, virtualenv, or condaenv) +#' @param chunksize Doench2016 is executed in chunks of chunksize +#' @param verbose TRUE (default) or FALSE +#' @param plot TRUE (default) or FALSE +#' @param ... passed to \code{\link{plot_intervals}} +#' @return numeric vector +#' @examples +#' # Install azimuth +#' #---------------- +#' ## With reticulate +#' # require(reticulate) +#' # conda_create('azienv', c('python=2.7')) +#' # use_condaenv('azienv') +#' # py_install(c('azimuth', 'scikit-learn==0.17.1', 'biopython=='1.76'), +#' # 'azienv', pip = TRUE) +#' +#' ## Directly +#' # conda create --name azienv python=2.7 +#' # conda activate azienv +#' # pip install scikit-learn==0.17.1 +#' # pip install biopython==1.76 +#' # pip install azimuth +#' +#' # PE example +#' #----------- +#' require(magrittr) +#' bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 +#' targets <- char_to_granges(c(PRNP = 'chr20:4699600:+', # snp +#' HBB = 'chr11:5227002:-', # snp +#' HEXA = 'chr15:72346580-72346583:-', # del +#' CFTR = 'chr7:117559593-117559595:+'), # ins +#' bsgenome) +#' spacers <- find_primespacers(targets, bsgenome, ontargetmethod=NULL, +#' offtargetmethod=NULL) +#' spacers %<>% score_ontargets(bsgenome, 'Doench2014') +#' # reticulate::use_condaenv('azienv') +#' # reticulate::import('azimuth') +#' # spacers %<>% score_ontargets(bsgenome, 'Doench2016') +#' +#' # TFBS example +#' #------------- +#' bedfile <- system.file('extdata/SRF.bed', package = 'multicrispr') +#' bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 +#' targets <- extend(bed_to_granges(bedfile, 'mm10')) +#' spacers <- find_spacers(targets, bsgenome, ontargetmethod=NULL, +#' offtargetmethod=NULL) +#' spacers %<>% score_ontargets(bsgenome, 'Doench2014') +#' # reticulate::use_condaenv('azienv') +#' # reticulate::import('azimuth') +#' # spacers %>% score_ontargets(bsgenome, 'Doench2016') +#' @references +#' Doench 2014, Rational design of highly active sgRNAs for +#' CRISPR-Cas9-mediated gene inactivation. Nature Biotechnology, +#' doi: 10.1038/nbt.3026 +#' +#' Doench 2016, Optimized sgRNA design to maximize activity and minimize +#' off-target effects of CRISPR-Cas9. Nature Biotechnology, +#' doi: 10.1038/nbt.3437 +#' +#' Python module azimuth: github/MicrosoftResearch/azimuth +#' @export +score_ontargets <- function( + spacers, bsgenome, ontargetmethod= c('Doench2014', 'Doench2016')[1], + chunksize = 10000, verbose = TRUE, plot = TRUE, ... +){ + # Assert + crisprcontext <- NULL + assert_is_all_of(spacers, 'GRanges') + if (is.null(ontargetmethod)) return(spacers) + assert_is_a_string(ontargetmethod) + assert_is_subset(ontargetmethod, c('Doench2014', 'Doench2016')) + if (ontargetmethod %in% names(mcols(spacers))){ + mcols(spacers)[[ontargetmethod]] <- NULL} + + # Add contextseq + if (verbose) cmessage('\tScore ontargets') + spacers %<>% add_context(bsgenome, verbose = verbose) + spacers %<>% filter_N(col="crisprcontext") + spacerdt <- gr2dt(spacers) + scoredt <- data.table(crisprcontext = unique(spacerdt$crisprcontext)) + + # Score + scores <- switch(ontargetmethod, + Doench2014 = doench2014(scoredt$crisprcontext, verbose=verbose), + Doench2016 = doench2016(scoredt$crisprcontext, chunksize=chunksize, + verbose=verbose)) + scoredt[ , (ontargetmethod) := scores ] + + # Merge back in + mergedt <- merge(spacerdt, scoredt, + by='crisprcontext', sort=FALSE, all.x=TRUE) + mergedt[, crisprcontext := NULL] + spacers <- dt2gr(mergedt, seqinfo = seqinfo(spacers)) + + # Plot + if (plot) print(plot_intervals(spacers, ...)) + + # Return + spacers +} + + +default_alpha_var <- function(gr){ + if ('off' %in% names(mcols(gr))) 'off' else NULL } \ No newline at end of file -- GitLab