## ----style, echo = FALSE, results = 'asis'--------------------------------------------------------
BiocStyle::markdown()
options(width=100, max.print=1000)
knitr::opts_chunk$set(
eval=as.logical(Sys.getenv("KNITR_EVAL", "TRUE")),
cache=as.logical(Sys.getenv("KNITR_CACHE", "TRUE")))
## ----setup, echo=FALSE, messages=FALSE, warnings=FALSE--------------------------------------------
suppressPackageStartupMessages({
library(Biostrings)
library(GenomicRanges)
library(GenomicFiles)
library(BiocParallel)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
})
## ----iranges--------------------------------------------------------------------------------------
library(IRanges)
ir <- IRanges(start=c(10, 20, 30), width=5)
ir
## ----iranges-flank--------------------------------------------------------------------------------
flank(ir, 3)
## ----iranges-class--------------------------------------------------------------------------------
class(ir)
getClass(class(ir))
## ----iranges-flank-method, eval=FALSE-------------------------------------------------------------
# ?"flank,Ranges-method"
## ----granges--------------------------------------------------------------------------------------
library(GenomicRanges)
gr <- GRanges(c("chr1", "chr1", "chr2"), ir, strand=c("+", "-", "+"))
gr
## ----granges-flank--------------------------------------------------------------------------------
flank(gr, 3)
## ----granges-class--------------------------------------------------------------------------------
class(gr)
getClass(class(gr))
## ----granges-flank-method, eval=FALSE-------------------------------------------------------------
# ?"flank,GenomicRanges-method"
## ----granges-methods------------------------------------------------------------------------------
methods(class="GRanges")
## ----granges-man-and-vignettes, eval=FALSE--------------------------------------------------------
# help(package="GenomicRanges")
# vignette(package="GenomicRanges")
# vignette(package="GenomicRanges", "GenomicRangesHOWTOs")
## ----txdb-----------------------------------------------------------------------------------------
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
## ----txdb-exons-----------------------------------------------------------------------------------
exons(txdb)
## ----txdb-exonsby---------------------------------------------------------------------------------
exonsBy(txdb, "tx")
## ----BSgenome-require, message=FALSE--------------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)
chr14_range = GRanges("chr14", IRanges(1, seqlengths(Hsapiens)["chr14"]))
chr14_dna <- getSeq(Hsapiens, chr14_range)
letterFrequency(chr14_dna, "GC", as.prob=TRUE)
## ----bam-require----------------------------------------------------------------------------------
library(GenomicRanges)
library(GenomicAlignments)
library(Rsamtools)
## our 'region of interest'
roi <- GRanges("chr14", IRanges(19653773, width=1))
## sample data
library('RNAseqData.HNRNPC.bam.chr14')
bf <- BamFile(RNAseqData.HNRNPC.bam.chr14_BAMFILES[[1]], asMates=TRUE)
## alignments, junctions, overlapping our roi
paln <- readGAlignmentsList(bf)
j <- summarizeJunctions(paln, with.revmap=TRUE)
j_overlap <- j[j %over% roi]
## supporting reads
paln[j_overlap$revmap[[1]]]
## ----vcf, message=FALSE---------------------------------------------------------------------------
## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
TxDb.Hsapiens.UCSC.hg19.knownGene,
CodingVariants())
head(coding)
## ----genomicalignments----------------------------------------------------------------------------
## example BAM data
library(RNAseqData.HNRNPC.bam.chr14)
## one BAM file
fl <- RNAseqData.HNRNPC.bam.chr14_BAMFILES[1]
## Let R know that this is a BAM file, not just a character vector
library(Rsamtools)
bfl <- BamFile(fl)
## ----readgalignments------------------------------------------------------------------------------
aln <- readGAlignments(bfl)
aln
## ----galignments-methods--------------------------------------------------------------------------
methods(class=class(aln))
## ----iteration------------------------------------------------------------------------------------
library(GenomicFiles)
yield <- function(bfl) {
## input a chunk of alignments
library(GenomicAlignments)
readGAlignments(bfl, param=ScanBamParam(what="seq"))
}
map <- function(aln) {
## Count G or C nucleotides per read
library(Biostrings)
gc <- letterFrequency(mcols(aln)$seq, "GC")
## Summarize number of reads with 0, 1, ... G or C nucleotides
tabulate(1 + gc, 73) # max. read length: 72
}
reduce <- `+`
## ----iteration-doit-------------------------------------------------------------------------------
library(RNAseqData.HNRNPC.bam.chr14)
fls <- RNAseqData.HNRNPC.bam.chr14_BAMFILES
bf <- BamFile(fls[1], yieldSize=100000)
gc <- reduceByYield(bf, yield, map, reduce)
plot(gc, type="h",
xlab="GC Content per Aligned Read", ylab="Number of Reads")
## ----parallel-doit--------------------------------------------------------------------------------
library(BiocParallel)
gc <- bplapply(BamFileList(fls), reduceByYield, yield, map, reduce)
library(ggplot2)
df <- stack(as.data.frame(lapply(gc, cumsum)))
df$GC <- 0:72
ggplot(df, aes(x=GC, y=values)) + geom_line(aes(colour=ind)) +
xlab("Number of GC Nucleotides per Read") +
ylab("Number of Reads")