Commit 672ba7c7 authored by aditya.bhagwat's avatar aditya.bhagwat
Browse files

BiocCheck

parent f7babb34
Pipeline #130119 passed with stages
in 30 seconds
......@@ -70,6 +70,7 @@ importFrom(assertive.files,assert_all_are_existing_files)
importFrom(assertive.numbers,assert_all_are_less_than)
importFrom(assertive.properties,assert_has_names)
importFrom(assertive.properties,has_names)
importFrom(assertive.reflection,is_windows)
importFrom(assertive.sets,assert_is_subset)
importFrom(assertive.strings,assert_all_are_matching_regex)
importFrom(assertive.types,assert_is_a_bool)
......
......@@ -3,6 +3,7 @@
#' @importFrom assertive.files assert_all_are_existing_files
#' @importFrom assertive.numbers assert_all_are_less_than
#' @importFrom assertive.properties has_names assert_has_names
#' @importFrom assertive.reflection is_windows
#' @importFrom assertive.sets assert_is_subset
#' @importFrom assertive.strings assert_all_are_matching_regex
#' @importFrom assertive.types assert_is_all_of assert_is_any_of
......
......@@ -219,19 +219,19 @@ extend <- function(
# Record
newgr <- gr
shift <- sprintf('(%s%d,%s%d)',
csign(start), abs(start), csign(end), abs(end))
csign(start), abs(start), csign(end), abs(end))
txt <- sprintf('\t\t%d%sextensions %s',
length(newgr),
ifelse(!strandaware, ' (strandagnostic) ', ' '),
shift)
length(newgr),
ifelse(!strandaware, ' (strandagnostic) ', ' '),
shift)
# Extend
GenomicRanges::start(newgr) <- GenomicRanges::start(newgr) + start
GenomicRanges::end( newgr) <- GenomicRanges::end( newgr) + end
if (strandaware){
idx <- as.logical(strand(newgr)=='-')
GenomicRanges::start(newgr)[idx] <- GenomicRanges::start(gr)[idx] - end
GenomicRanges::end( newgr)[idx] <- GenomicRanges::end(gr)[idx] - start
idx <- as.logical(strand(newgr)=='-')
GenomicRanges::start(newgr)[idx] <- GenomicRanges::start(gr)[idx] - end
GenomicRanges::end( newgr)[idx] <- GenomicRanges::end(gr)[idx] - start
}
# Add seq
......@@ -285,12 +285,12 @@ add_inverse_strand <- function(gr, verbose = FALSE, plot = FALSE, ...){
# Assert
assertive::assert_is_all_of(gr, 'GRanges')
# Invert
revcomps <- invertStrand(gr)
if ('seq' %in% names(mcols(gr))){
revcomps$seq <- as.character(Biostrings::reverseComplement(
DNAStringSet(gr$seq)))
DNAStringSet(gr$seq)))
}
# Concatenate
......@@ -349,14 +349,14 @@ add_inverse_strand <- function(gr, verbose = FALSE, plot = FALSE, ...){
#' double_flank(gr, plot = TRUE)
#' @export
double_flank <- function(
gr,
upstart = -200,
upend = -1,
downstart = 1,
downend = 200,
plot = FALSE,
linetype_var = 'set',
...
gr,
upstart = -200,
upend = -1,
downstart = 1,
downend = 200,
plot = FALSE,
linetype_var = 'set',
...
){
# Up flank, down flank, concatenate
......@@ -371,13 +371,13 @@ double_flank <- function(
# Plot
if (plot){
gr$set <- 'original'
newgr$set <- 'flanks'
allgr <- c(gr, newgr)
allgr$set %<>% factor(c('original', 'flanks'))
print(plot_intervals(
gr$set <- 'original'
newgr$set <- 'flanks'
allgr <- c(gr, newgr)
allgr$set %<>% factor(c('original', 'flanks'))
print(plot_intervals(
allgr, linetype_var = linetype_var, title = txt, ...))
newgr$set <- NULL
newgr$set <- NULL
}
# Return
......
# Convert PAM into regex format
# @examples
# x <- 'NGG'
# pam_to_regex <- function(x){
# assert_is_a_string(x)
# x %>% stringi::stri_replace_all_fixed('N', '[ACGT]')
# }
# This regex-based function fails to find all crispr spacers
# It is, therefore, no longer used, just kept for reference purposes.
# Example explanation of why it fails
# x <- Biostrings::DNAString('AGCAGCTGGGGCAGTGGTGGGGGGCCTTGGCGGCTACA')
# stringi::stri_locate_all_regex(as.character(x), '[ACGT]{21}GG')
# Biostrings::matchPattern('NNNNNNNNNNNNNNNNNNNNNGG', x, fixed = FALSE)
# regex_based_find_crispr_spacers <- function(
# gr, bsgenome, pam = 'NGG', plot = TRUE, verbose = TRUE){
#
# # Assert. Import. Comply
# assert_is_all_of(gr, 'GRanges')
# assert_is_all_of(bsgenome, 'BSgenome')
# assert_is_a_bool(verbose)
# start <- substart <- crispr_start <- NULL
# end <- subend <- crispr_end <- strand <- seqnames <- NULL
# gr %<>% add_seq(bsgenome)
# gr %<>% name_uniquely()
#
# # Find crispr sites in targetranges
# pattern <- paste0('[ACGT]{20}', pam_to_regex(pam))
# if (verbose) cmessage('\tFind %s crispr sites', pattern)
# targetdt <- as.data.table(gr)
# res <- targetdt$seq %>% stri_locate_all_regex(pattern)
# cextract1 <- function(y) y[, 1] %>% paste0(collapse=';')
# cextract2 <- function(y) y[, 2] %>% paste0(collapse=';')
# targetdt [ , substart := vapply( res, cextract1, character(1)) ]
# targetdt [ , subend := vapply( res, cextract2, character(1)) ]
#
# # Rm crispr-free targetranges
# idx <- targetdt[, substart == 'NA']
# if (sum(idx)>0){
# if (verbose) cmessage('\t\tRm %d ranges with no crispr site',
# sum(idx))
# targetdt %<>% extract(!idx)
# }
#
# # Transform into crispr ranges
# targetdt[, targetstart := start]
# targetdt[, targetend := end ]
#
# sites_dt <- tidyr::separate_rows(targetdt, substart, subend) %>%
# data.table() %>%
# extract(, substart := as.numeric(substart)) %>%
# extract(, subend := as.numeric(subend)) %>%
# extract(, seq := substr(seq, substart, subend)) %>%
# extract( strand=='+', crispr_start := start + substart - 1) %>%
# extract( strand=='+', crispr_end := start + subend - 1) %>%
# extract( strand=='-', crispr_start := end - subend + 1) %>%
# extract( strand=='-', crispr_end := end - substart + 1) %>%
# extract(,
# list(seqnames = seqnames, start = crispr_start,
# end = crispr_end, strand = strand, seq = seq,
# targetname = targetname, targetstart = targetstart,
# targetend = targetend))
# sites <- GRanges(unique(sites_dt), seqinfo = seqinfo(gr))
# sites$site <- uniquify(sites$targetname)
# spacer <- sites %>% extend( 0, -3, stranded=TRUE, bsgenome=bsgenome)
# pam <- sites %>% down_flank(-2, 0, stranded=TRUE, bsgenome=bsgenome)
# spacer$spacer <- spacer$seq
# spacer$seq <- NULL
# spacer$pam <- pam$seq
#
# # Plot. Message. Return
# if (plot){
# original <- copy(sites, start=sites$targetstart, end=sites$targetend)
# original$set <- 'target'
# spacer$set <- 'spacer'
# plot_intervals(c(original, spacer), color_var = 'set',
# size_var = 'set', y = 'site')
# sites$set <- NULL
# }
# if (verbose) cmessage('\t\t%d cas9 spacers across %d ranges',
# length(unique(spacer$spacer)), length(spacer))
# return(sites)
# }
#' Extract subranges
#'
#' Extract subranges from a \code{\link[GenomicRanges]{GRanges-class}} object
#'
#' Extract subranges (defined \code{\link[IRanges]{IRanges-class}} format) from a
#' \code{\link{GRanges}} object
#'
#' @param gr \code{\link[GenomicRanges]{GRanges-class}}
#' @param ir \code{\link[IRanges]{IRanges-class}}: subranges to be extracted
#' @param plot TRUE or FALSE (default)
......@@ -198,90 +281,3 @@ extend_for_pe <- function(
}
# Convert PAM into regex format
# @examples
# x <- 'NGG'
# pam_to_regex <- function(x){
# assert_is_a_string(x)
# x %>% stringi::stri_replace_all_fixed('N', '[ACGT]')
# }
# This regex-based function fails to find all crispr spacers
# It is, therefore, no longer used, just kept for reference purposes.
# Example explanation of why it fails
# x <- Biostrings::DNAString('AGCAGCTGGGGCAGTGGTGGGGGGCCTTGGCGGCTACA')
# stringi::stri_locate_all_regex(as.character(x), '[ACGT]{21}GG')
# Biostrings::matchPattern('NNNNNNNNNNNNNNNNNNNNNGG', x, fixed = FALSE)
# regex_based_find_crispr_spacers <- function(
# gr, bsgenome, pam = 'NGG', plot = TRUE, verbose = TRUE){
#
# # Assert. Import. Comply
# assert_is_all_of(gr, 'GRanges')
# assert_is_all_of(bsgenome, 'BSgenome')
# assert_is_a_bool(verbose)
# start <- substart <- crispr_start <- NULL
# end <- subend <- crispr_end <- strand <- seqnames <- NULL
# gr %<>% add_seq(bsgenome)
# gr %<>% name_uniquely()
#
# # Find crispr sites in targetranges
# pattern <- paste0('[ACGT]{20}', pam_to_regex(pam))
# if (verbose) cmessage('\tFind %s crispr sites', pattern)
# targetdt <- as.data.table(gr)
# res <- targetdt$seq %>% stri_locate_all_regex(pattern)
# cextract1 <- function(y) y[, 1] %>% paste0(collapse=';')
# cextract2 <- function(y) y[, 2] %>% paste0(collapse=';')
# targetdt [ , substart := vapply( res, cextract1, character(1)) ]
# targetdt [ , subend := vapply( res, cextract2, character(1)) ]
#
# # Rm crispr-free targetranges
# idx <- targetdt[, substart == 'NA']
# if (sum(idx)>0){
# if (verbose) cmessage('\t\tRm %d ranges with no crispr site',
# sum(idx))
# targetdt %<>% extract(!idx)
# }
#
# # Transform into crispr ranges
# targetdt[, targetstart := start]
# targetdt[, targetend := end ]
#
# sites_dt <- tidyr::separate_rows(targetdt, substart, subend) %>%
# data.table() %>%
# extract(, substart := as.numeric(substart)) %>%
# extract(, subend := as.numeric(subend)) %>%
# extract(, seq := substr(seq, substart, subend)) %>%
# extract( strand=='+', crispr_start := start + substart - 1) %>%
# extract( strand=='+', crispr_end := start + subend - 1) %>%
# extract( strand=='-', crispr_start := end - subend + 1) %>%
# extract( strand=='-', crispr_end := end - substart + 1) %>%
# extract(,
# list(seqnames = seqnames, start = crispr_start,
# end = crispr_end, strand = strand, seq = seq,
# targetname = targetname, targetstart = targetstart,
# targetend = targetend))
# sites <- GRanges(unique(sites_dt), seqinfo = seqinfo(gr))
# sites$site <- uniquify(sites$targetname)
# spacer <- sites %>% extend( 0, -3, stranded=TRUE, bsgenome=bsgenome)
# pam <- sites %>% down_flank(-2, 0, stranded=TRUE, bsgenome=bsgenome)
# spacer$spacer <- spacer$seq
# spacer$seq <- NULL
# spacer$pam <- pam$seq
#
# # Plot. Message. Return
# if (plot){
# original <- copy(sites, start=sites$targetstart, end=sites$targetend)
# original$set <- 'target'
# spacer$set <- 'spacer'
# plot_intervals(c(original, spacer), color_var = 'set',
# size_var = 'set', y = 'site')
# sites$set <- NULL
# }
# if (verbose) cmessage('\t\t%d cas9 spacers across %d ranges',
# length(unique(spacer$spacer)), length(spacer))
# return(sites)
# }
......@@ -91,18 +91,17 @@ index_targets <- function(
# Reduce
targets %<>% GenomicRanges::reduce()
if (verbose) cmessage('\t\t%d ranges after merging overlaps', length(targets))
if (verbose) cmessage(
'\t\t%d ranges after merging overlaps', length(targets))
# Write to fasta
targetdir <- target_dir(outdir)
targetfasta <- target_fasta(outdir)
targetseqs <- BSgenome::getSeq(bsgenome, targets)
names(targetseqs) <- sprintf(
'%s:%s-%s:%s',
as.character(seqnames(targets)),
start(targets),
end(targets),
strand(targets))
'%s:%s-%s:%s',
as.character(seqnames(targets)), start(targets), end(targets),
strand(targets))
if (verbose) cmessage('\t\tWrite seqs to %s', targetfasta)
dir.create(dirname(targetfasta), showWarnings = FALSE, recursive = TRUE)
Biostrings::writeXStringSet(targetseqs, targetfasta)
......@@ -152,7 +151,8 @@ read_bowtie_results <- function(outfile){
col.names = c('readname', 'strand', 'target', 'position',
'readseq', 'quality', 'matches', 'mismatches'))
dt[ is.na(mismatches), mismatch := 0]
dt[!is.na(mismatches), mismatch := stringi::stri_count_fixed(mismatches, '>')]
dt[!is.na(mismatches), mismatch := stringi::stri_count_fixed(
mismatches, '>')]
results <- dt %>%
extract( , .N, keyby = .(readname, mismatch)) %>%
......@@ -210,7 +210,8 @@ match_seqs <- function(seqs, indexdir, norc, mismatches = 2,
# Map reads
outfile <- spacer_matchfile(outdir, indexdir)
if (verbose) cmessage('\t\tMap reads: %s', outfile)
run_bowtie(readfasta, indexdir, outfile, norc = norc, mismatches = mismatches)
run_bowtie(
readfasta, indexdir, outfile, norc = norc, mismatches = mismatches)
# Load results
if (verbose) cmessage('\t\tLoad results')
......@@ -277,8 +278,8 @@ match_spacers <- function(
spacerseqs <- unique(spacers$crisprspacer)
pamseqs <- expand_iupac_ambiguities(pam)
crisprdt <- data.table(
crisprspacer = rep(spacerseqs, each = length(pamseqs)),
crisprpam = rep(pamseqs, times = length(spacerseqs))) %>%
crisprspacer = rep(spacerseqs, each=length(pamseqs)),
crisprpam = rep(pamseqs, times=length(spacerseqs))) %>%
extract(, crispr := paste0(crisprspacer, pamseqs))
crisprseqs <- unique(crisprdt$crispr)
......
......@@ -92,9 +92,12 @@ doench2016 <- function(
# Score
azi <- reticulate::import('azimuth.model_comparison', delay_load = TRUE)
nchunks <- ceiling(length(contextseqs) / chunksize)
contextchunks <- split(contextseqs, ceiling(seq_along(contextseqs)/chunksize))
cmessage('\t\tRun Doench2016 %d times on %d-seq chunks and concatenate (to preserve memory)', length(contextchunks), chunksize)
mc.cores <- if (assertive.reflection::is_windows()) 1 else parallel::detectCores()-2
contextchunks <- split(
contextseqs, ceiling(seq_along(contextseqs)/chunksize))
txt <- paste0('\t\tRun Doench2016 %d times on %d-seq chunks ',
'and concatenate (to preserve memory)')
cmessage(txt, length(contextchunks), chunksize)
mc.cores <- if (is_windows()) 1 else parallel::detectCores()-2
doench2016scores <- unlist(parallel::mclapply(contextchunks,
function(x){
reticulate::py_suppress_warnings(
......@@ -216,15 +219,16 @@ add_efficiency <- function(
scoredt[ , (method) := scores ]
# Merge back in
mergedt <- merge(spacerdt, scoredt, by='crisprcontext', sort=FALSE, all.x=TRUE)
mergedt <- merge(spacerdt, scoredt,
by='crisprcontext', sort=FALSE, all.x=TRUE)
spacers <- dt2gr(mergedt, seqinfo = seqinfo(spacers))
# Plot
if (plot){
scores <- mcols(spacers)[[method]]
tertiles <- stats::quantile(scores, c(0.33, 0.66, 1))
labels <- sprintf('%s < %s (%s)',
method, as.character(round(tertiles, 2)), names(tertiles))
labels <- sprintf('%s < %s (%s)', method,
as.character(round(tertiles, 2)), names(tertiles))
spacers$efficiency <- cut(scores, c(0, tertiles), labels)
p <- plot_intervals(
spacers, size_var = 'efficiency', alpha_var = alpha_var) +
......
......@@ -37,4 +37,4 @@ pdf('Z:/PaperInPrep/multicrispr/figures/fig1/fig1.pdf',
width = figure_width( fig, 'inch'),
height = figure_height(fig, 'inch'))
print(fig)
dev.off()
\ No newline at end of file
dev.off()
......@@ -19,10 +19,6 @@ extract_subranges(gr, ir, plot = FALSE)
\description{
Extract subranges from a \code{\link[GenomicRanges]{GRanges-class}} object
}
\details{
Extract subranges (defined \code{\link[IRanges]{IRanges-class}} format) from a
\code{\link{GRanges}} object
}
\examples{
gr <- GenomicRanges::GRanges(c(A = 'chr1:1-100:+', B = 'chr1:1-100:-'))
gr$targetname <- 'AB'
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment