Skip to content
Snippets Groups Projects
Commit b2619ace authored by Rene Wiegandt's avatar Rene Wiegandt
Browse files

04_bed_to_granges.R: Filtering range duplicates

parent 8cc4d37e
No related branches found
No related tags found
No related merge requests found
Pipeline #246544 passed
#=============================================================================
#============================================================================= # bed_to_granges
# bed_to_granges #=============================================================================
#=============================================================================
annotate_granges <- function(gr, txdb){
annotate_granges <- function(gr, txdb){
# Assert. Import. Comply.
# Assert. Import. Comply. assert_is_all_of(gr, 'GRanges')
assert_is_all_of(gr, 'GRanges') assert_is_all_of(txdb, 'TxDb')
assert_is_all_of(txdb, 'TxDb') gene_id <- NULL
gene_id <- NULL
# Align seqlevelStyle if required
# Align seqlevelStyle if required if (seqlevelsStyle(gr) != seqlevelsStyle(txdb)){
if (seqlevelsStyle(gr) != seqlevelsStyle(txdb)){ message("Setting seqlevelsStyle(txdb) <- seqlevelsStyle(gr)")
message("Setting seqlevelsStyle(txdb) <- seqlevelsStyle(gr)") seqlevelsStyle(txdb) <- seqlevelsStyle(gr)
seqlevelsStyle(txdb) <- seqlevelsStyle(gr) }
}
# Drop seqinfo (to overlap smoothly)
# Drop seqinfo (to overlap smoothly) txranges <- GenomicFeatures::genes(txdb) %>%
txranges <- GenomicFeatures::genes(txdb) %>% as.data.table() %>%
as.data.table() %>% extract(seqlevelsInUse(gr), on = 'seqnames') %>%
extract(seqlevelsInUse(gr), on = 'seqnames') %>% extract(, c('seqnames', 'start', 'end', 'strand', 'gene_id'),
extract(, c('seqnames', 'start', 'end', 'strand', 'gene_id'), with = FALSE) %>%
with = FALSE) %>% as('GRanges')
as('GRanges')
# Overlap
# Overlap granno <- as.data.table(gr) %>%
granno <- as.data.table(gr) %>% as('GRanges') %>%
as('GRanges') %>% plyranges::join_overlap_left(txranges) %>%
plyranges::join_overlap_left(txranges) %>% as.data.table() %>%
as.data.table() %>% extract(!is.na(gene_id) ,
extract(!is.na(gene_id) , gene_id := paste0(gene_id, collapse = ';'),
gene_id := paste0(gene_id, collapse = ';'), by = c('seqnames', 'start', 'end', 'strand')) %>%
by = c('seqnames', 'start', 'end', 'strand')) %>% unique() %>%
unique() %>% as('GRanges')
as('GRanges') seqlevels(granno) %<>% setdiff('.') # patch plyranges bug
seqlevels(granno) %<>% setdiff('.') # patch plyranges bug seqinfo(granno) <- seqinfo(gr)
seqinfo(granno) <- seqinfo(gr) granno
granno }
}
#' Read bedfile into GRanges
#' Read bedfile into GRanges #' @param bedfile file path
#' @param bedfile file path #' @param genome string: UCSC genome name (e.g. 'mm10')
#' @param genome string: UCSC genome name (e.g. 'mm10') #' @param txdb NULL (default) or \code{\link[GenomicFeatures]{TxDb-class}}
#' @param txdb NULL (default) or \code{\link[GenomicFeatures]{TxDb-class}} #' (used for gene annotation)
#' (used for gene annotation) #' @param do_order TRUE (default) or FALSE: order on seqnames and star?
#' @param do_order TRUE (default) or FALSE: order on seqnames and star? #' @param plot TRUE (default) or FALSE: plot karyogram?
#' @param plot TRUE (default) or FALSE: plot karyogram? #' @param verbose TRUE (default) or FALSE
#' @param verbose TRUE (default) or FALSE #' @return \code{\link[GenomicRanges]{GRanges-class}}
#' @return \code{\link[GenomicRanges]{GRanges-class}} #' @seealso \code{\link{char_to_granges}}, \code{\link{genes_to_granges}}
#' @seealso \code{\link{char_to_granges}}, \code{\link{genes_to_granges}} #' @examples
#' @examples #' bedfile <- system.file('extdata/SRF.bed', package = 'multicrispr')
#' bedfile <- system.file('extdata/SRF.bed', package = 'multicrispr') #' bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#' bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10 #' (gr <- bed_to_granges(bedfile, genome='mm10'))
#' (gr <- bed_to_granges(bedfile, genome='mm10')) #' @export
#' @export bed_to_granges <- function(
bed_to_granges <- function( bedfile,
bedfile, genome,
genome, txdb = NULL,
txdb = NULL, do_order = TRUE,
do_order = TRUE, plot = TRUE,
plot = TRUE, verbose = TRUE
verbose = TRUE ){
){ . <- NULL
. <- NULL
# Assert
# Assert assert_all_are_existing_files(bedfile)
assert_all_are_existing_files(bedfile) if (!is.null(txdb)) assert_is_all_of(txdb, 'TxDb')
if (!is.null(txdb)) assert_is_all_of(txdb, 'TxDb') assert_is_a_bool(do_order)
assert_is_a_bool(do_order) assert_is_a_bool(plot)
assert_is_a_bool(plot) assert_is_a_bool(verbose)
assert_is_a_bool(verbose)
# Read
# Read if (verbose) cmessage('\tRead %s into GRanges', basename(bedfile))
if (verbose) cmessage('\tRead %s into GRanges', basename(bedfile)) gr <- rtracklayer::import.bed(bedfile, genome = genome)
gr <- rtracklayer::import.bed(bedfile, genome = genome) if (verbose) cmessage('\t\t%d ranges on %d chromosomes',
if (verbose) cmessage('\t\t%d ranges on %d chromosomes', length(gr), length(unique(seqnames(gr))))
length(gr), length(unique(seqnames(gr))))
# Annotate
# Annotate if (!is.null(txdb)){
if (!is.null(txdb)){ cmessage('\t\tAnnotate with txdb')
cmessage('\t\tAnnotate with txdb') gr %<>% annotate_granges(txdb)
gr %<>% annotate_granges(txdb) }
}
# Plot
# Plot genome1 <- unique(genome(gr))
genome1 <- unique(genome(gr)) assert_is_scalar(genome1)
assert_is_scalar(genome1) title <- paste0(genome1, ': ', basename(bedfile))
title <- paste0(genome1, ': ', basename(bedfile)) if (plot) plot_karyogram(gr, title)
if (plot) plot_karyogram(gr, title)
# Order
# Order if (do_order) gr %<>% sort(ignore.strand = TRUE)
if (do_order) gr %<>% sort(ignore.strand = TRUE) #%<>% extract( order(seqnames(.), start(.)))
#%<>% extract( order(seqnames(.), start(.)))
# Filter duplicates
# Record gr <- unique(gr)
names(gr) <- gr$targetname <- make_unique_names(gr, 'T')
gr$targetstart <- GenomicRanges::start(gr) # Record
gr$targetend <- GenomicRanges::end(gr) names(gr) <- gr$targetname <- make_unique_names(gr, 'T')
gr$targetstart <- GenomicRanges::start(gr)
# Return gr$targetend <- GenomicRanges::end(gr)
gr
} # Return
gr
}
#' Convert character vector into GRanges
#' @param x character vector
#' @param bsgenome \code{\link[BSgenome]{BSgenome-class}} #' Convert character vector into GRanges
#' @return \code{\link[GenomicRanges]{GRanges-class}} #' @param x character vector
#' @examples #' @param bsgenome \code{\link[BSgenome]{BSgenome-class}}
#' require(magrittr) #' @return \code{\link[GenomicRanges]{GRanges-class}}
#' bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38 #' @examples
#' x <- c(PRNP = 'chr20:4699600:+', # snp #' require(magrittr)
#' HBB = 'chr11:5227002:-', # snp #' bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
#' HEXA = 'chr15:72346580-72346583:-', # del #' x <- c(PRNP = 'chr20:4699600:+', # snp
#' CFTR = 'chr7:117559593-117559595:+') # ins #' HBB = 'chr11:5227002:-', # snp
#' gr <- char_to_granges(x, bsgenome) #' HEXA = 'chr15:72346580-72346583:-', # del
#' plot_intervals(gr, facet_var = c('targetname', 'seqnames')) #' CFTR = 'chr7:117559593-117559595:+') # ins
#' @seealso \code{\link{bed_to_granges}}, \code{\link{genes_to_granges}} #' gr <- char_to_granges(x, bsgenome)
#' @export #' plot_intervals(gr, facet_var = c('targetname', 'seqnames'))
char_to_granges <- function(x, bsgenome){ #' @seealso \code{\link{bed_to_granges}}, \code{\link{genes_to_granges}}
gr <- GenomicRanges::GRanges(x, seqinfo = BSgenome::seqinfo(bsgenome)) #' @export
names(gr) <- gr$targetname <- make_unique_names(gr, 'T') char_to_granges <- function(x, bsgenome){
gr$targetstart <- GenomicRanges::start(gr) gr <- GenomicRanges::GRanges(x, seqinfo = BSgenome::seqinfo(bsgenome))
gr$targetend <- GenomicRanges::end(gr) names(gr) <- gr$targetname <- make_unique_names(gr, 'T')
gr gr$targetstart <- GenomicRanges::start(gr)
} gr$targetend <- GenomicRanges::end(gr)
gr
}
#' Convert geneids into GRanges
#' @param file Gene identifier file (one per row)
#' @param geneids Gene identifier vector #' Convert geneids into GRanges
#' @param complement TRUE (default) or FALSE: add complementary strand? #' @param file Gene identifier file (one per row)
#' @param txdb \code{\link[GenomicFeatures]{TxDb-class}} or #' @param geneids Gene identifier vector
#' \code{\link[ensembldb]{EnsDb-class}} #' @param complement TRUE (default) or FALSE: add complementary strand?
#' @param plot TRUE (default) or FALSE #' @param txdb \code{\link[GenomicFeatures]{TxDb-class}} or
#' @param verbose TRUE (default) or FALSE #' \code{\link[ensembldb]{EnsDb-class}}
#' @return \code{\link[GenomicRanges]{GRanges-class}} #' @param plot TRUE (default) or FALSE
#' @seealso \code{\link{char_to_granges}}, \code{\link{bed_to_granges}} #' @param verbose TRUE (default) or FALSE
#' @examples #' @return \code{\link[GenomicRanges]{GRanges-class}}
#' # Entrez #' @seealso \code{\link{char_to_granges}}, \code{\link{bed_to_granges}}
#' #------- #' @examples
#' genefile <- system.file('extdata/SRF.entrez', package='multicrispr') #' # Entrez
#' geneids <- as.character(read.table(genefile)[[1]]) #' #-------
#' txdb <- getFromNamespace('TxDb.Mmusculus.UCSC.mm10.knownGene', #' genefile <- system.file('extdata/SRF.entrez', package='multicrispr')
#' 'TxDb.Mmusculus.UCSC.mm10.knownGene') #' geneids <- as.character(read.table(genefile)[[1]])
#' (gr <- genes_to_granges(geneids, txdb)) #' txdb <- getFromNamespace('TxDb.Mmusculus.UCSC.mm10.knownGene',
#' (gr <- genefile_to_granges(genefile, txdb)) #' 'TxDb.Mmusculus.UCSC.mm10.knownGene')
#' #' (gr <- genes_to_granges(geneids, txdb))
#' # Ensembl #' (gr <- genefile_to_granges(genefile, txdb))
#' #-------- #'
#' # txdb <- AnnotationHub::AnnotationHub()[["AH75036"]] #' # Ensembl
#' # genefile <- system.file('extdata/SRF.ensembl', package='multicrispr') #' #--------
#' # geneids <- as.character(read.table(genefile)[[1]]) #' # txdb <- AnnotationHub::AnnotationHub()[["AH75036"]]
#' # (gr <- genes_to_granges(geneids, txdb)) #' # genefile <- system.file('extdata/SRF.ensembl', package='multicrispr')
#' # (gr <- genefile_to_granges(genefile, txdb)) #' # geneids <- as.character(read.table(genefile)[[1]])
#' @export #' # (gr <- genes_to_granges(geneids, txdb))
genes_to_granges <- function( #' # (gr <- genefile_to_granges(genefile, txdb))
geneids, #' @export
txdb, genes_to_granges <- function(
complement = TRUE, geneids,
plot = TRUE, txdb,
verbose = TRUE complement = TRUE,
){ plot = TRUE,
verbose = TRUE
# Assert ){
assert_is_character(geneids)
assert_is_any_of(txdb, c('TxDb', 'EnsDb')) # Assert
assert_is_a_bool(complement) assert_is_character(geneids)
assert_is_a_bool(plot) assert_is_any_of(txdb, c('TxDb', 'EnsDb'))
assert_is_a_bool(complement)
# Convert assert_is_a_bool(plot)
gr <- GenomicFeatures::genes(txdb)[geneids]
if (verbose) cmessage('\t\tConvert %d genes to %d GRanges', # Convert
length(geneids), length(gr)) gr <- GenomicFeatures::genes(txdb)[geneids]
if (verbose) cmessage('\t\tConvert %d genes to %d GRanges',
# Add complementary strand length(geneids), length(gr))
if (complement){
gr %<>% add_inverse_strand(plot = FALSE, verbose = verbose) # Add complementary strand
} if (complement){
gr %<>% add_inverse_strand(plot = FALSE, verbose = verbose)
# Plot }
if (plot) plot_karyogram(gr)
# Plot
# Record if (plot) plot_karyogram(gr)
names(gr) <- gr$targetname <- make_unique_names(gr, 'T')
gr$targetstart <- GenomicRanges::start(gr) # Record
gr$targetend <- GenomicRanges::end(gr) names(gr) <- gr$targetname <- make_unique_names(gr, 'T')
gr$targetstart <- GenomicRanges::start(gr)
# Return gr$targetend <- GenomicRanges::end(gr)
gr
} # Return
gr
}
#' @rdname genes_to_granges
#' @export
genefile_to_granges <- function(file, txdb, complement = TRUE, plot = TRUE){ #' @rdname genes_to_granges
assert_all_are_existing_files(file) #' @export
geneids <- read.table(file)[[1]] %>% as.character() genefile_to_granges <- function(file, txdb, complement = TRUE, plot = TRUE){
genes_to_granges(geneids, txdb, complement = complement, plot = plot) assert_all_are_existing_files(file)
} geneids <- read.table(file)[[1]] %>% as.character()
genes_to_granges(geneids, txdb, complement = complement, plot = plot)
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment