Commit ff65b53d authored by aditya.bhagwat's avatar aditya.bhagwat
Browse files

Organize scripts

parent 61c0cd25
grA.png

129 Bytes

require(magrittr)
require(multicrispr)
#=============
# TFBS
#=============
# Decide which to keep
require(multicrispr)
require(magrittr)
reticulate::use_condaenv('azienv')
bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
bedfile <- system.file('extdata/SRF.bed', package='multicrispr')
targets <- bed_to_granges(bedfile, genome='mm10', plot = FALSE)
targets <- multicrispr::bed_to_granges(bedfile, genome='mm10', plot = FALSE)
png('graphs/srf_karyogram.png')
plot_karyogram(targets, title = NULL)
multicrispr::plot_karyogram(targets, title = NULL)
dev.off()
extended <- extend(targets, -22, +22)
......@@ -23,7 +25,7 @@ targets['T0151'] %>% down_flank() %>% plot_intervals()
spacers <- extended %>%
find_spacers(bsgenome) %>%
add_specificity(extended, bsgenome) %>%
add_efficiency(bsgenome, method = 'Doench2016', condaenv='azienv')
add_efficiency(bsgenome, method = 'Doench2016')
spacers %>% subset(seqnames == 'chr1') %>%
subset(specific == TRUE) %>%
gr2dt() %>%
......@@ -35,65 +37,77 @@ spacers2 %<>% subset(specific==TRUE)
spacers2$score <- spacers2$name <- spacers2$crisprcontext <- spacers2$specific <- NULL
spacers2
targets %<>% subset(targetname %in% c('T0050', 'T0151'))
extended%<>% subset(targetname %in% c('T0050', 'T0151'))
spacers %<>% subset(targetname %in% c('T0050', 'T0151'))
targets %<>% subset(targetname %in% c('T0050')) # T0151
extended%<>% subset(targetname %in% c('T0050'))
spacers %<>% subset(targetname %in% c('T0050'))
# Create plots: original
p <- plot_intervals(targets) +
p <- plot_intervals(targets, color_var = 'targetname') +
ggplot2::scale_color_manual(values = c(T0050 = "#00BFC4")) +
ggplot2::guides(color = FALSE, linetype = FALSE)
ggplot2::ggsave('graphs/srf.png', p, width = 2.5, height = 2)
ggplot2::ggsave('graphs/srf01.pdf', p, width =2.2, height = 1.5, device = grDevices::cairo_pdf)
#ggplot2::ggsave('graphs/srf01.png', p, width = 2.2, height = 1.5)
p <- plot_intervals(extended) +
ggplot2::scale_color_manual(values = c(T0050 = "#00BFC4")) +
ggplot2::guides(color = FALSE, linetype = FALSE)
ggplot2::ggsave('graphs/srf_extended.png', p, width = 2.5, height = 2)
ggplot2::ggsave('graphs/srf02_extended.pdf', p, width =2.2, height = 1.5, device = grDevices::cairo_pdf)
#ggplot2::ggsave('graphs/srf02_extended.png', p, width = 2.2, height = 1.5)
p <- plot_intervals(spacers) +
ggplot2::scale_color_manual(values = c(T0050 = "#00BFC4")) +
ggplot2::guides(color = FALSE)
ggplot2::ggsave('graphs/srf_spacers.png', p, width = 2.5, height = 2)
ggplot2::ggsave('graphs/srf03_spacers.pdf', p, width =2.2, height = 1.5, device = grDevices::cairo_pdf)
#ggplot2::ggsave('graphs/srf03_spacers.png', p, width = 2.2, height = 1.5)
p <- plot_intervals(spacers, alpha_var = 'specific') +
ggplot2::scale_color_manual(values = c(T0050 = "#00BFC4")) +
ggplot2::scale_alpha_manual(values = c(`TRUE`=1, `FALSE`=0.20)) +
ggplot2::guides(color = FALSE, alpha = FALSE)
ggplot2::ggsave('graphs/srf_specific.png', p, width = 2.5, height = 2)
ggplot2::ggsave('graphs/srf04_specific.pdf', p, width = 2.2, height = 1.5, device = grDevices::cairo_pdf)
#ggplot2::ggsave('graphs/srf04_specific.png', p, width = 2.2, height = 1.5)
quantiles <- round(quantile(spacers$Doench2016, c(.33, .66, 1)), 2)
quantiles <- round(quantile(spacers2$Doench2016, c(.33, .66, 1)), 2)
spacers$efficiency <- spacers$Doench2016 %>% cut(c(0, quantiles), labels = as.character(quantiles))
p <- plot_intervals(spacers, size_var = 'efficiency', alpha_var = 'specific') +
ggplot2::scale_color_manual(values = c(T0050 = "#00BFC4")) +
ggplot2::scale_size_manual(values = c(0.2, 1.5, 3) %>% set_names(quantiles)) +
ggplot2::scale_alpha_manual(values = c(`TRUE`=1, `FALSE`=0.25)) +
ggplot2::guides(color = FALSE, alpha = FALSE, size = FALSE)
ggplot2::ggsave('graphs/srf_efficient.png', p, width = 2.5, height = 2)
ggplot2::ggsave('graphs/srf05_efficient.pdf', p, width = 2.2, height = 1.5, device = grDevices::cairo_pdf)
#ggplot2::ggsave('graphs/srf05_efficient.png', p, width = 2.2, height = 1.5)
# PE
#=====
bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
gr <- char_to_granges(c(PRNP = 'chr20:4699600:+', # snp
HBB = 'chr11:5227002:-', # snp
HEXA = 'chr15:72346580-72346583:-', # del
CFTR = 'chr7:117559593-117559595:+'), # ins
bsgenome)
# gr <- char_to_granges(c(PRNP = 'chr20:4699600:+', # snp
# HBB = 'chr11:5227002:-', # snp
# HEXA = 'chr15:72346580-72346583:-', # del
# CFTR = 'chr7:117559593-117559595:+'), # ins
# bsgenome)
gr <- char_to_granges(c(HBB = 'chr11:5227002:-'), bsgenome)
plot_intervals(gr, facet_var = c('seqnames', 'targetname'))
spacers <- gr %>% find_pe_spacers(bsgenome, nrt=48) %>% add_genome_counts()
spacers$specific <- spacers$G0==1
spacers %>% add_efficiency(bsgenome, 'Doench2016', condaenv = 'azienv')
spacers %>% add_efficiency(bsgenome, 'Doench2016')
# Select HBB
gr %<>% extract('HBB')
p <- plot_intervals(gr, facet_var = c('seqnames', 'targetname')) +
ggplot2::guides(color = FALSE)
ggplot2::ggsave('graphs/hbb.png', p, width=2.2, height=1.8)
ggplot2::ggsave('graphs/hbb01.pdf', p, width=2.2, height=1.8, device = grDevices::cairo_pdf)
extended <- extend_for_pe(gr, nrt = 48)
p <- plot_intervals(extended, facet_var = c('seqnames', 'targetname')) +
ggplot2::guides(color = FALSE)
ggplot2::ggsave('graphs/hbb_extended.png', p, width=2.1, height=1.8)
ggplot2::ggsave('graphs/hbb02_extended.pdf', p, width=2.1, height=1.8, device = grDevices::cairo_pdf)
spacers <- gr %>% find_pe_spacers(bsgenome, nrt=48)
p <- plot_intervals(spacers, facet_var = c('seqnames', 'targetname')) +
ggplot2::guides(color = FALSE, size = FALSE)
ggplot2::ggsave('graphs/hbb_spacers.png', p, width=2.2, height=1.8)
ggplot2::ggsave('graphs/hbb03_spacers.pdf', p, width=2.2, height=1.8, device = grDevices::cairo_pdf)
spacers %<>% add_genome_counts()
spacers$specific <- spacers$G0==1
......@@ -101,9 +115,9 @@ p <- plot_intervals(spacers, facet_var = c('seqnames', 'targetname'),
alpha_var = 'specific') +
ggplot2::scale_alpha_manual(values = c(`TRUE` = 1, `FALSE` = 0.25)) +
ggplot2::guides(color = FALSE, size = FALSE, alpha = FALSE)
ggplot2::ggsave('graphs/hbb_specific.png', p, width=2.2, height=1.8)
ggplot2::ggsave('graphs/hbb04_specific.pdf', p, width=2.2, height=1.8, device = grDevices::cairo_pdf)
spacers %<>% add_efficiency(bsgenome, 'Doench2016', condaenv = 'azienv')
spacers %<>% add_efficiency(bsgenome, 'Doench2016')
quantiles <- round(quantile(spacers$Doench2016, c(0.33, 0.66, 1)), 2)
spacers$efficiency <- cut(spacers$Doench2016, c(0, quantiles), labels = quantiles) %>%
as.character()
......@@ -112,7 +126,7 @@ p <- plot_intervals(spacers, facet_var = c('seqnames', 'targetname'),
ggplot2::scale_alpha_manual(values = c(`TRUE` = 1, `FALSE` = 0.25)) +
ggplot2::scale_size_manual(values = c(0.2, 1.5, 3) %>% set_names(quantiles)) +
ggplot2::guides(color = FALSE, size = FALSE, alpha = FALSE)
ggplot2::ggsave('graphs/hbb_efficient.png', p, width=2.2, height=1.8)
ggplot2::ggsave('graphs/hbb05_efficient.pdf', p, width=2.2, height=1.8, device = grDevices::cairo_pdf)
gr %<>% extract('HBB')
......
require(magrittr)
require(data.table)
require(multicrispr)
# Shariati paper
# 1. Target oct4 binding site 137-151 nt upstream of Nanog TSS
# 2. Target Utf1 gene
......@@ -11,29 +14,7 @@ ensembldb::columns(ensdb)
ensembldb::select(ensdb, 'Nanog', keytype = 'GENENAME', columns = ensembldb::columns(ensdb))
ensembldb::select(ensdb, 'Nanog', keytype = 'GENENAME', columns = c('SEQNAME', 'SEQSTRAND', 'TXID', 'TXSEQSTART', 'TXCDSSEQSTART', 'TXCDSSEQEND', 'TXSEQEND', 'GENESEQSTART', 'GENESEQEND'))
# Find spacers in oct4 site upstream of nanog tss
targets <- multicrispr::char_to_granges(c(oct4_nanog='chr6:122707565:+'), bsgenome) %>%
multicrispr::up_flank(-151, -137)
targets$targetstart <- start(targets)
targets$targetend <- end(targets)
targets %<>% multicrispr::extend(-22, +22)
bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
pattern <- 'GGTCACCTTACAGCTTCTTTTGCATTACAATGTCCATGGTGG'
BSgenome::getSeq(bsgenome, targets, as.character=TRUE) %>%
stringi::stri_detect_fixed('GGTCACCTTACAGCTTCTTTTGCATTACAATGTCCATGGTGG')
spacers <- multicrispr::find_spacers(targets, bsgenome)
spacers %<>% multicrispr:::add_genome_counts(bsgenome)
reticulate::use_condaenv('azienv')
spacers %<>% multicrispr::add_efficiency(bsgenome, method = 'Doench2016')
# Find spacers in oct4 site downstream of Utf1
ensembldb::select(ensdb, 'Utf1', keytype = 'GENENAME', columns = c('SEQNAME', 'SEQSTRAND', 'TXID', 'TXSEQSTART', 'TXCDSSEQSTART', 'TXCDSSEQEND', 'TXSEQEND', 'GENESEQSTART', 'GENESEQEND'))
targets <- multicrispr::char_to_granges(c(oct4_utf1='chr7:139943789:+'), bsgenome) %>%
multicrispr::down_flank(+1825, +1825)
targets$targetstart <- start(targets)
targets$targetend <- end(targets)
targets %<>% multicrispr::extend(-100, +100)
BSgenome::getSeq(bsgenome, targets) %>%
# Find spacers in oct4 site ups <- <- <- me, targets) %>%
#Biostrings::reverseComplement() %>%
as.character() %>%
stringi::stri_detect_fixed('GTTGTTATGCTAGTGAAGTGC')
require(magrittr)
require(data.table)
require(multicrispr)
bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
x <- multicrispr::char_to_granges(
c( PRNP = 'chr20:4699600:+', # snp
......@@ -16,11 +20,57 @@ spacers['PRNP_4']
# GCAGTGGTGGGGGGCCTTGG # Anzalone
spacers %<>% sort(ignore.strand=TRUE)
spacers$anzalone <- names(spacers) %in% c('HBB_1', 'HEXA_3', 'PRNP_4')
spacers$set <- ifelse(names(spacers) %in% c('HBB_1', 'HEXA_3', 'PRNP_4'), 'anzalone', 'multicrispr')
x <- list(multicrispr = names(subset(spacers, targetname=='HBB')),
anzalone = names(subset(spacers, targetname=='HBB' & set=='anzalone')))
plot_venn <- function(x, title=''){
names(x) %<>% paste0(' (', vapply(x, length, integer(1)), ')')
p <- VennDiagram::venn.diagram(
x,
filename = NULL,
fill = c("#00BFC4", "#F8766D", "#00BA38")[seq_along(x)],
col = c("#00BFC4", "#F8766D", "#00BA38")[seq_along(x)],
cat.pos = rep( 0, length(x)),
cat.dist = rep(-0.1, length(x)),
#cat.col = c(multicrispr = "#00BFC4", anzalone = "#F8766D"),
scaled = FALSE,
main = title,
main.pos = c(0.5, 1.06))
grid::grid.draw(p)
p
}
do_plot_venn <- function(selectedtarget){
grid::grobTree(
plot_venn(
list(multicrispr = names(subset(spacers, targetname==selectedtarget)),
Anzalone = names(subset(spacers, targetname==selectedtarget & set=='anzalone'))),
title = selectedtarget))
}
p1 <- grid::grobTree(do_plot_venn('HBB'))
p2 <- grid::grobTree(do_plot_venn('HEXA'))
p3 <- grid::grobTree(do_plot_venn('PRNP'))
p_pe_venn <- gridExtra::grid.arrange(p1, p2, p3, layout_matrix = matrix(1:3, nrow=1))
grid::grid.draw(p_pe_venn)
multicrispr::plot_intervals(spacers, facet_var = c('seqnames','targetname'),
size_var = 'anzalone')
size_var = 'in_anzalone')
spacers %<>% add_efficiency(bsgenome, 'Doench2016')
ggplot2::ggplot(as.data.table(spacers), ggplot2::aes(x=in_anzalone, y = Doench2016, color=set)) +
ggplot2::geom_point() + ggplot2::facet_wrap(~ targetname) + ggplot2::theme_bw()
spacers %<>% add_genome_counts(bsgenome)
p_pe_on <- ggplot2::ggplot(as.data.table(spacers), ggplot2::aes(y=as.factor(as.character(G0-1)), x = Doench2016, color=set)) +
ggplot2::geom_point(size = 4, shape = 15, alpha=0.6) +
ggplot2::facet_wrap(~ targetname) +
ggplot2::theme_bw() +
ggplot2::ylab('Offtarget matches') +
ggplot2::guides(color = FALSE)
spacers$unique <- spacers$G0==1
reticulate::use_condaenv('azienv')
spacers %<>% add_efficiency(bsgenome, method = 'Doench2016')
......
# Load functions
require(reticulate)
use_condaenv('azienv')
import('azimuth')
require(AnnotationDbi)
require(BSgenome)
require(multicrispr)
require(magrittr)
require(biomaRt)
require(data.table)
require(stringi)
# Load Brunello library
brunello_url <- 'https://www.addgene.org/static/cms/filer_public/8b/4c/8b4c89d9-eac1-44b2-bb2f-8fea95672705/broadgpp-brunello-library-contents.txt'
brunello_file <- '../multicrisprout/brunello/brunello.txt'
#download.file(brunello_url, brunello_file)
brunello <- data.table::fread(brunello_file)
brunello %<>% extract(!is.na(`Target Gene ID`)) # rm pos controls
brunello[, .(N = length(unique(`Target Transcript`))), by = 'Target Gene ID'][, table(N)] # 1 transcr per gene
brunello[, .(N = length(unique(`Exon Number`))), by = 'Target Gene ID'][, table(N)] # 1-4 exons per gene
brunello[, .(N = length(unique(`Exon Number`))), by = 'Target Transcript'][, table(N)] # per transcript
brunello[`Target Gene ID`==1]
brunello$`Target Transcript` %<>% substr(1, nchar(.)-2)
# Add strand
ensmart <- biomaRt::useMart('ensembl', dataset = 'hsapiens_gene_ensembl')
strandt <- getBM(attributes = c('refseq_mrna', 'strand'), filters = 'refseq_mrna', values = unique(brunello$`Target Transcript`), mart = ensmart)
strandt %<>% as.data.table()
strandt %<>% extract(, .SD[.N==1], by = 'refseq_mrna') # rm the ones that don't map uniquely to a strand
brunello %<>% merge.data.table(strandt, by.x = 'Target Transcript', by.y = 'refseq_mrna') # 76 441 -> 75 260
# Add seqnames
brunello$`Target Gene ID`
chromdt <- getBM(attributes = c('refseq_mrna', 'chromosome_name'), filters = 'refseq_mrna', values = unique(brunello$`Target Transcript`), mart = ensmart)
chromdt %<>% as.data.table()
chromdt %<>% extract(, .SD[nchar(chromosome_name) == min(nchar(chromosome_name))], by = 'refseq_mrna')
chromdt[, table(chromosome_name)]
chromdt %<>% extract(, .SD[.N>1], by = 'chromosome_name')
chromdt
brunello %<>% merge.data.table(chromdt, by.x = 'Target Transcript', by.y = 'refseq_mrna') # 76 441 -> 75 260 -> 75 244
# Find spacer ranges
brunello %>% setnames(c('Strand', 'strand', 'chromosome_name'), c('sense', 'numericstrand', 'seqnames'))
brunello[, strand := vapply(numericstrand, multicrispr:::csign, character(1)) ]
brunello[numericstrand==+1 & brunello$sense=='antisense', strand := '-']
brunello[numericstrand==-1 & brunello$sense=='antisense', strand := '+']
brunello %>% setnames(c('Position of Base After Cut (1-based)'), c( 'start'))
brunello %>% extract(, end := start)
brunello$cutsite <- brunello$start
bsgenome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38
br <- brunello %>% dt2gr(seqinfo(bsgenome)[standardChromosomes(bsgenome)]) %>% as('GRanges')
br[strand(br)=='+'] %<>% extend(-17, +2)
br[strand(br)=='-'] %<>% extend(-16, +3)
br$crisprspacer <- BSgenome::getSeq(bsgenome, br, as.character=TRUE)
br %<>% subset(`sgRNA Target Sequence`==crisprspacer) # Drop 12 non-matches (due to sequence changes). 75 244 -> 75 232
# Are context seqs identical? Yes!
br$context <- br %>% extend(-4,+6) %>% getSeq(bsgenome, ., as.character=TRUE)
subset(br, context == `Target Context Sequence`)
# find_spacers
br$`Target Gene ID` <- br$`Genomic Sequence` <- br$sense <- br$`sgRNA Target Sequence` <- br$`Target Context Sequence` <- NULL
br$numericstrand <- br$cutsite <- br$`Exon Number` <- br$context <- NULL
names(mcols(br)) %<>% stringi::stri_replace_first_fixed('PAM Sequence', 'crisprpam')
names(mcols(br)) %<>% stringi::stri_replace_first_fixed('Rule Set 2 score', 'Doench2016')
br %<>% sort(ignore.strand=TRUE)
br$targetname <- names(br) <- sprintf('br%05d', seq_along(br))
spacers <- br %>% extend(0, +3) %>% multicrispr::find_spacers(bsgenome, plot = FALSE)
# All brunello spacers are found
# 4 813 additional spacers are found on rev strands
spcoords <- unname(as.character(granges(spacers)))
brcoords <- unname(as.character(granges(br)))
setdiff(brcoords, spcoords) %>% length()
setdiff(spcoords, brcoords) %>% length()
autonomics.plot::plot_venn(list(multicrispr = spcoords, brunello = brcoords))
brunello_spacers <- spacers[spcoords %in% brcoords]
names(mcols(brunello_spacers)) %<>% stringi::stri_replace_first_fixed('Doench2016', 'BrunelloDoench2016')
brunello_spacers[1:10] %>% add_efficiency(bsgenome, 'Doench2016', plot = FALSE)
brunello_spacers %<>% add_efficiency(bsgenome, 'Doench2016', plot = FALSE)
plot(brunello_spacers$BrunelloDoench2016, brunello_spacers$Doench2016)
plot(density(brunello_spacers$Doench2016))
# Find overlapping exons
ensdb <- multicrispr::EnsDb.Hsapiens.v98()
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
exons1 <- ensembldb::exons(ensdb)
#exons1 <- exons(txdb)
chromosomes <- setdiff(standardChromosomes(exons1), 'MT')
exons1 %<>% subset(seqnames(exons1) %in% chromosomes)
exons1 %<>% gr2dt() %>% dt2gr(seqinfo(exons1)[chromosomes])
br %<>% gr2dt() %>% dt2gr(seqinfo(br)[chromosomes])
res <- findOverlaps(exons1, extend(br, 0, +3), ignore.strand = TRUE, minoverlap = 23)
exons1 %<>% extract(queryHits(res))
exons1$targetname <- names(br)[subjectHits(res)]
length(unique(exons1$targetname))
br %<>% extract(unique(subjectHits(res)))
length(br)
exonsdt <- gr2dt(exons1)
exonsdt[, length(unique(targetname))]
exonsdt %<>% extract(, .SD[width == min(width)], by = 'targetname') # 75 229 /
exonsdt %<>% extract(, .SD[1], by = 'targetname')
exons1 <- exonsdt %>% dt2gr(seqinfo(exons1))
all(start(exons1) <= start(extend(br, 0, +3)))
all( end(exons1) >= end(extend(br, 0, +3)))
exons1
br
exonspacers <- find_spacers(exons1, bsgenome, plot = FALSE) # 2 552 511
brunellocoords <- unname(as.character(granges(br)))
exonspacercoords <- unname(as.character(granges(exonspacers)))
autonomics.plot::plot_venn(list(brunello = brunellocoords, multicrispr = exonspacercoords))
exonspacers$in_brunello <- exonspacercoords %in% brunellocoords
exonspacers %>% subset(in_brunello==TRUE)
exonspacers %<>% add_context(bsgenome)
exonspacers %<>% add_efficiency(bsgenome, 'Doench2016')
contextseqs <- exonspacers$crisprcontext[1:1000]
score_doench2016_parallel <- function(contextseqs, chunksize = 100){
nchunks <- ceiling(length(contextseqs) / chunksize)
scores<-BiocParallel::bplapply(
seq_len(nchunks),
function(i){
chunkstart <- (i-1)*chunksize + 1
chunkend <- min(i*chunksize, length(contextseqs))
multicrispr:::doench2016(contextseqs[chunkstart:chunkend])
}) %>%
unlist()
}
parallel::mclapply()
exonspacers$Doench2016 <- 0
exonspacers$Doench2016[ 1:100000] <- multicrispr:::doench2016(exonspacers$crisprcontext[ 1:100000])
exonspacers$Doench2016[100001:200000] <- multicrispr:::doench2016(exonspacers$crisprcontext[100001:200000])
tmp <- exonspacers[1:100000]
tmp %<>% add_efficiency(bsgenome, 'Doench2016')
# add_genome_counts
bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
seqlevelsStyle(br) <- 'UCSC'
br %<>% add_genome_counts(bsgenome, outdir = '~/multicrisprout/brunello')
length(unique(br$G0))
table(br$G0)
barplot(table(br$G0), log = "y", xlab = '# Exact genome matches')
brunello # 76 065
spacers # 4 374 614
spacers %<>% add_genome_counts( # works (also) on vm!
mismatches=1,
indexedgenomesdir = '../../multicrisprout/indexedgenomes',
outdir = '../../multicrisprout')
spacerdt <- as.data.table(spacers)
seqinfo(exons1) <- seqinfo(bsgenome)
bsgenome
columns(tx)
select(tx, keys='1', columns = c('CDSCHROM', 'CDSSTRAND'), keytype = 'GENEID')
getSeq(BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38, 'chr19', 58351502-3, 58351502+16, strand='-', as.character = TRUE) %>%
equals(brunello[1]$`sgRNA Target Sequence`)
# one transcript per gene is hit
# Keep the part of brunello that tallies with current ensdb
brunello %>% setkey('Target Gene ID')
brunello[unique(keys(ensdb, keytype = 'ENTREZID'))]
# Map RefSeq mRNA ids to ensembl transcript ids (which can then be mapped to exon rank)
'NM_130786.3'
ensmart <- useMart('ensembl')
listDatasets(ensmart) %>% subset(dataset %>% stringi::stri_detect_fixed('hsapiens'))
ensmart <- useMart('ensembl', dataset = 'hsapiens_gene_ensembl')
listAttributes(ensmart)
listAttributes(ensmart) %>% extract2('name') %>% extract(stringi::stri_detect_fixed(., 'exon'))
values <- brunello[, `Target Transcript` %>% tstrsplit('[.]') %>% extract2(1)] %>% unique() %>% na.exclude()
bmdt <- getBM(attributes = c('refseq_mrna', 'ensembl_transcript_id'), filters = 'refseq_mrna', values = values, mart = ensmart)
bmdt %<>% as.data.table()
bmdt[, .(N=length(unique(ensembl_transcript_id))), by = 'refseq_mrna'][, table(N)]
brunello$`Target Gene ID`
brunello %<>% extract(!is.na(`Target Gene ID`))
keys1 <- unique(as.character(brunello$`Target Gene ID`))
keys1 %<>% extract(!is.na(keys1))
# Map chr using txdb
txgenes <- select(tx, keys=,keys1, columns = c('CDSCHROM'), keytype = 'GENEID')
txgenes %<>% as.data.table()
txgenes %<>% extract(, .SD[nchar(CDSCHROM)==min(nchar(CDSCHROM))], by = 'GENEID')
txgenes[, .SD[.N>1], by = 'GENEID']
setdiff(keys1, txgenes$GENEID)
# Map chr using ensdb
keytypes(ensdb)
columns(ensdb)
ensgenes <- select(ensdb, keys = keys1, keytype='ENTREZID', columns = c('SEQNAME'))
ensgenes %<>% as.data.table()
ensgenes %<>% extract(, .SD[nchar(SEQNAME)==min(nchar(SEQNAME))], by = 'ENTREZID')
ensgenes[, .SD[length(unique(SEQNAME))>1], by = 'ENTREZID']
ensgenes %<>% extract(!(ENTREZID==266 & SEQNAME=='Y'))
ensgenes[, .SD[length(unique(SEQNAME))>1], by = 'ENTREZID']
ensgenes[, table(SEQNAME)]
ensgenes[, SEQNAME := paste0('chr', SEQNAME)]
ensgenes[, .SD[.N>1], by = 'ENTREZID']
setdiff(keys1, ensgenes$ENTREZID)
keys2 <- unique(as.character(brunello$`Target Gene Symbol`))
ensgenenames <- select(ensdb, keys = keys2, keytype='SYMBOL', columns = c('SEQNAME'))
setdiff(keys2, ensgenenames$SYMBOL)
brunello %>% merge(tmp, by.x = 'Target Gene ID', by.y = 'ENTREZID', sort = FALSE, all.x = TRUE)
tmp %<>% as.data.table()
tmp %<>% extract(!is.na(CDSCHROM))
tmp %<>% extract(, .SD[nchar(CDSCHROM) == min(nchar(CDSCHROM))], by = 'GENEID')
tmp %>% setnames(c('CDSCHROM', 'CDSSTRAND'), c('chrom', 'strand'))
tmp[, .N, by = 'GENEID'][N>1]
brunello %>% merge(tmp, by.x = 'Target Gene ID', by.y = 'GENEID', sort = FALSE, all.x = TRUE)
exons
tx
columns(tx)
columns(ensdb)
select(tx, keys='1', columns = c("TXCHROM", "TXEND", "TXID", "TXNAME", "TXSTART", "TXSTRAND", "TXTYPE"), keytype = 'GENEID')
select(tx, keys='1', columns = c("TXID", "EXONCHROM","EXONEND", "EXONID", "EXONNAME", "EXONRANK", "EXONSTART", "EXONSTRAND"), keytype = 'GENEID')
select(tx, keys='1', columns = c("TXCHROM", "TXEND", "TXID", "TXNAME", "TXSTART", "TXSTRAND", "TXTYPE"), keytype = 'GENEID')
ensdb
exons <- ensembldb::exons(ensdb)
txexons <- exons(tx) # 687K
brunello$`Exon Number`
txexons$exon_id
exons # 830K
bs <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
gr <- char_to_granges(c(A1BG= 'chr19:58351502:-'), bs)
gr %<>% extend(-16, +3)
getSeq(bs, gr)
brunello[1]$`sgRNA Target Sequence`
# Load functions
require(reticulate)
use_condaenv('azienv')
import('azimuth')
# require(AnnotationDbi)
# require(BSgenome)
# require(multicrispr)
# require(magrittr)
# require(biomaRt)
# require(data.table)
# require(stringi)
# Reconstruct Brunello spacers . Transform into exons
brunello <- multicrispr:::reconstruct_brunello_spacers()
brunello_spacers <- multicrispr:::validate_brunello_spacers(brunello)
brunello_exons <- multicrispr:::find_enclosing_exons(brunello)
brunello %<>% extract(brunello_exons$targetname)
# Validate
bsgenome <- BSgenome.Hsapiens.NCBI.GRCh38::BSgenome.Hsapiens.NCBI.GRCh38
exonspacers <- find_spacers(brunello_exons, bsgenome, plot = FALSE) # 2 583 378
exonspacers %<>% add_efficiency(bsgenome, 'Doench2016', chunksize=100000, plot = FALSE)
bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
seqlevelsStyle(exonspacers) <- 'UCSC'
exonspacers %<>% add_genome_counts(
bsgenome, outdir = '../multicrisprout/brunello',
indexedgenomesdir = '../multicrisprout/indexedgenomes',
mismatches = 1)
seqlevelsStyle(exonspacers) <- 'NCBI'
#saveRDS(exonspacers, '../multicrisprout/brunello/exonspacers.rds')