%\VignetteEngine{knitr::knitr}
%% \VignetteIndexEntry{01 Project Overview -- Slides}
\documentclass[xcolor=dvipsnames]{beamer}
\usepackage{BioconductorSlides}
\hypersetup{colorlinks,linkcolor=,urlcolor=Blue}
\AtBeginSection[]
{
\begin{frame}{Outline}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\title{R / Bioconductor for Analysis and
Comprehension of High-Throughput Sequence Data}
\author{Martin T. Morgan (\url{mtmorgan@fhcrc.org}) \\
Fred Hutchinson Cancer Research Center \\
Seattle, WA, USA}
\date{3 February 2014}
\maketitle
\section*{Introduction}
\begin{frame}{Overview}
\begin{enumerate}
\item Introduction to \R{} and \Bioconductor{}
\item Sequencing work flows
\item Successful computational biology software
\item Exemplars: algorithms into actions
\item Challenges \& opportunities
\end{enumerate}
\end{frame}
\begin{frame}{Introduction: \Bioconductor}
Analysis and comprehension of high-throughput genomic data
\begin{itemize}
\item \url{http://bioconductor.org}
\item $>11$ years old, $749$ packages
\end{itemize}
Themes
\begin{itemize}
\item Rigorous statistics
\item Reproducible work flows
\item Integrative analysis
\end{itemize}
\end{frame}
\begin{frame}{Introduction: \Bioconductor}
\begin{columns}
\column{.5\textwidth}
\begin{itemize}
\item 1341 PubMed full-text citations in trailing 12 months
\item 28,000 web visits / month; 75,000 unique IP downloads / year
\item Annual conferences; courses; active mailing list; \ldots
\end{itemize}
\column{.5\textwidth}
\includegraphics[width=\textwidth]{figures/usage-geography}
\end{columns}
\bigskip\par
\textbf{\Bioconductor{} Conference}, July 30 -- Aug 1, Boston, USA
\end{frame}
\begin{frame}{Introduction: What is \Bioconductor{} good for?}
\begin{itemize}
\item Microarrays: expression, copy number, SNPs, methylation, \ldots
\item Sequencing: RNA-seq, ChIP-seq, called variants, \ldots
\begin{itemize}
\item Especially \emph{after} assembly / alignment
\end{itemize}
\item Annotation: genes, pathways, gene models (exons, transcripts,
etc.), \ldots
\item Flow cytometry, proteomics, image analysis, high-throughput
screens, \ldots
\end{itemize}
\end{frame}
\section*{Sequencing work flows}
\begin{frame}[fragile]{Introduction: \R}
\begin{columns}
\column{.45\textwidth}
\begin{itemize}
\item \url{http://r-project.org}
\item Open-source, statistical programming language; widely used in
academia, finance, pharma, \ldots
\item Core language and base packages
\item Interactive sessions, scripts
\item $>5000$ contributed packages
\end{itemize}
\column{.55\textwidth}
<<eval=FALSE>>=
## Two 'vectors'
x <- rnorm(1000)
y <- x + rnorm(1000, sd=.5)
## Integrated container
df <- data.frame(X=x, Y=y)
## Visualize
plot(Y ~ X, df)
## Regression; 'object'
fit <- lm(Y ~ X, df)
## Methods on the object
abline(fit) # regression line
anova(fit) # ANOVA table
@
\end{columns}
\end{frame}
\begin{frame}[fragile]{Sequencing: Work flows}
\begin{columns}
\column{.5\textwidth}
\begin{enumerate}
\item Experimental design
\item `Wet lab' sample prep
\item Sequencing
\begin{itemize}
\item 100s of millions of reads
\item 30--150 nucleotides
\item Single and paired-end
\item Bar codes, lanes \& flow cells
\end{itemize}
\item Alignment
\item Analysis: DNA, RNA, epigenetics, integrative, microbiome,
\ldots
\end{enumerate}
\column{.5\textwidth}
\includegraphics[width=\textwidth,height=!]{figures/Solexa-bridge-pcr.jpg}
\par Bentley et al., 2008, Nature 456:
\href{http://www.ncbi.nlm.nih.gov/pubmed/18987734}{53-9}
\end{columns}
\end{frame}
% NOTE(review): this region was damaged by text extraction. Several FASTQ
% quality strings below are truncated or merged, and record ERR127302.1706
% is missing; the frame boundaries and chunk header are reconstructed.
% Restore the exact records from ERR127302_1.fastq.gz if they matter.
\begin{frame}[fragile]{Sequencing: Work flows}
{\scriptsize\begin{verbatim}
@ERR127302.1703 HWI-EAS350_0441:1:1:1460:19184#0/1
CCTGAGTGAAGCTGATCTTGATCTACGAAGAGAGATAGATCTTGATCGTCGAGGAGATGCTGACCTTGACCT
+
HHGHHGHHHHHHHHDGG>CE?=896=:
@ERR127302.1704 HWI-EAS350_0441:1:1:1460:16861#0/1
GCGGTATGCTGGAAGGTGCTCGAATGGAGAGCGCCAGCGCCCCGGCGCTGAGCCGCAGCCTCAGGTCCGCCC
+
DE?DD>ED4>EEE>DE8EEEDE8B?EB<@3;BA79?,881B?@73;1?########################
@ERR127302.1705 HWI-EAS350_0441:1:1:1460:13054#0/1
AAAACACCCTGCAATCTTTCAGACAGGATGTTGACAATGCGTCTCTGGCACGTCTTGACCTTGAACGCAAAG
+
EEDEE>AD>BBGGB8E8EEEGBGGGGBGGGGG3G>E3*?BE??BBC8GB8??:??GGDGDDD>D>BGGD8EG,<6D@<@G@>AB@B?8AA>CE@D8@B=?CC>AG
@ERR127302.1707 HWI-EAS350_0441:1:1:1461:6983#0/1
CGACGCTGACACCGGAACGGCAGCAGCAGCAGGACGATTAAGACAAGGAGGATGGCTCCACAGACGCTCATG
+
GEEGEGE@GGGGGGEGGGGGBB>G3?33?8*;;79?<9@?DD8@DDEE888;-BB?.A##############
@ERR127302.1708 HWI-EAS350_0441:1:1:1461:10827#0/1
AAAGAAGGTCCTTGCAATAGACTGCCTCTGCTTGAGAACTTATGATGTAATTATTGCATGCTGCTAATATAC
+
GGGGGDDEBFGGGGGBE,DAGDDGGGEEEGACEBEFDEEFEDH:@.7@49;88G8>G>DDG@D>D@G@GE>@DDBDDG
\end{verbatim}
}
\end{frame}
\begin{frame}[fragile]{Sequencing: The \Biocpkg{ShortRead} package}
<<eval=FALSE>>=
## Use the 'ShortRead' package
library(ShortRead)
## Create an object to represent a sample from a file
sampler <- FastqSampler("ERR127302_1.fastq.gz")
## Apply a method to yield a random sample
fq <- yield(sampler)
## Access sequences of sampled reads using `sread()`
## Summarize nucleotide use by cycle
## 'abc' is a nucleotide x cycle matrix of counts
abc <- alphabetByCycle(sread(fq))
## Subset of interesting nucleotides
abc <- abc[c("A", "C", "G", "T", "N"),]
@
\end{frame}
\begin{frame}[fragile]{Sequencing: The \Biocpkg{ShortRead} package}
\begin{columns}
\column{.5\textwidth}
<<eval=FALSE>>=
## Create a plot from a
## matrix
matplot(t(abc), type="l",
lty=1, lwd=3,
xlab="Cycle",
ylab="Count",
cex.lab=2)
## Add a legend
legend("topright",
legend=rownames(abc),
lty=1, lwd=3, col=1:5,
cex=1.8)
@
\column{.5\textwidth}
\includegraphics[width=\textwidth]{figures/abc}
\end{columns}
\end{frame}
\begin{frame}{Sequencing: Essential packages and classes}
\begin{itemize}
\item \Biocpkg{Biostrings} and \Rclass{DNAStringSet}
\item \Biocpkg{GenomicRanges} and \Rclass{GRanges}
\item \Biocpkg{GenomicFeatures} and \Rclass{TranscriptDb}
\item \Biocpkg{VariantAnnotation} and \Rclass{VCF}
\item Input and output: \Biocpkg{rtracklayer} (WIG, BED, etc.),
\Biocpkg{Rsamtools} (BAM), \Biocpkg{ShortRead} (FASTQ) file input
\end{itemize}
\end{frame}
\section*{Principles}
\begin{frame}{Principles: Some key points}
\begin{itemize}
\item \R{} is a high-level programming language, so lots can be
accomplished with just a little code
\item Packages such as \Biocpkg{ShortRead} provide a great way to
benefit from the expertise of others (and to contribute your own
expertise back to the community!)
\begin{itemize}
\item The path from `user' to `developer' is not that long, and
has been taken by many!
\end{itemize}
\item Objects and methods such as \Rclass{data.frame},
\Rclass{ShortReadQ} and \Rcode{alphabetByCycle()} help to manage
complicated data
\begin{itemize}
\item Reducing possibility for clerical and other mistakes
\item Facilitating inter-operability between different parts of an
analysis
\end{itemize}
\item Scripts make work flows reproducible
\item Visualizing data is an important part of exploratory analysis
\end{itemize}
\end{frame}
\begin{frame}{Principles: Successful computational biology software}
\begin{enumerate}
\item Extensive: software, annotation, integration
\begin{itemize}
\item 750 inter-operable \Bioconductor{} packages
\end{itemize}
\item Statistical: volume, technology, experimental design
\begin{itemize}
\item \R{} a `natural' for statistical analysis
\end{itemize}
\item Reproducible: long-term, multi-participant science
\begin{itemize}
\item Objects, scripts, vignettes, packages, \ldots{} encourage
reproducible research
\end{itemize}
\item Leading edge: novel, technology-driven
\begin{itemize}
\item Packages and user community closely track leading edge
science
\end{itemize}
\item Accessible: affordable, transparent, usable
\begin{itemize}
\item \Bioconductor{} is free and open, with extensive
documentation and an active and supportive user community
\end{itemize}
\end{enumerate}
Case study: differential expression of known genes; see also
\href{https://bioconductor.org/help/course-materials/2013/EMBOBGI/reproducible-research.pdf}{reproducible
research} lecture.
\end{frame}
\section*{Exemplars}
\begin{frame}{Exemplars: Algorithms to action}
\begin{enumerate}
\item Batch effects
\item Methylation
\item RNA-seq Differential Representation
\item Visualization
\end{enumerate}
\end{frame}
\begin{frame}{Exemplar: Differential Representation}
Haglund et al., 2012
\href{http://www.ncbi.nlm.nih.gov/pubmed/23024189}{J Clin Endocrin
Metab}
\bigskip\par
\begin{columns}
\column{.5\textwidth}
\begin{itemize}
\item Scientific finding: identify genes whose expression is
regulated by estrogen receptors in parathyroid adenoma cells
\item Statistical challenges: between-sample normalization;
appropriate statistical model; efficient estimation; \ldots
\end{itemize}
\column{.5\textwidth}
\includegraphics[width=\textwidth]{figures/DESeq2_parathyroid-plotMApadjchange.png}
\end{columns}
\bigskip\par\Bioconductor{} support: \Biocpkg{DESeq2}, \Biocpkg{edgeR}, many
statistical `lessons learned' from microarrays; extensive
integration with down-stream tools
\end{frame}
\begin{frame}{Exemplar: Batch Effects}
Leek et al., 2010, Nature Reviews Genetics 11,
\href{http://www.nature.com/nrg/journal/v11/n10/abs/nrg2825.html}{733-739},
Leek \& Storey
\href{http://dx.doi.org/10.1371/journal.pgen.0030161}{PLoS Genet
3(9): e161}
\begin{columns}
\column{.5\textwidth}
\begin{itemize}
\item Scientific finding: pervasive batch effects
\item Statistical insights: surrogate variable analysis: identify
and build surrogate variables; remove known batch effects
\item Benefits: reduce dependence, stabilize error rate estimates,
and improve reproducibility
\end{itemize}
\Bioconductor{} support: \Biocpkg{sva}
\column{.5\textwidth}
\only<1>{
\includegraphics[width=\textwidth]{figures/nrg2825-f2.jpg}
\par{\small HapMap samples from one facility, ordered by
date of processing. From Leek et al., 2010.}
}
\only<2>{
\begin{enumerate}
\item Remove signal due to variable(s) of interest
\item Identify subset of genes driving orthogonal signatures
of EH
\item Build a surrogate variable based on full EH signature of
that subset
\item Include significant surrogate variables as covariates
\end{enumerate}
EH: expression heterogeneity
}
\end{columns}
\end{frame}
\begin{frame}{Exemplar: Methylation}
Hansen et al., 2011, Nature Genetics 43,
\href{http://www.nature.com/ng/journal/v43/n8/full/ng.865.html}{768-775}
\begin{itemize}
\item Scientific finding: stochastic methylation variation of
cancer-specific differentially methylated regions (DMRs), distinguishing cancer from
normal tissue, in several cancers.
\item Statistical challenges: smoothing, non-specific filtering, $t$
statistics, find DMRs
\end{itemize}
\bigskip\par
\includegraphics[width=\textwidth]{figures/bsseq_analysis-1.png}
\medskip\par \Bioconductor{} support: whole-genome (\Biocpkg{bsseq})
or reduced representation (\Biocpkg{MethylSeekR}) bisulfite
sequencing; Illumina 450k arrays (\Biocpkg{minfi})
\end{frame}
\begin{frame}{Exemplar: Visualization}
\begin{columns}
\column{.5\textwidth}
\Biocpkg{Gviz}\par
\only<1-2>{
\begin{itemize}
\item Track-like visualizations
\item Data panels
\item Fully integrated with \Bioconductor{} sequence
representations
\end{itemize}
}
\Biocpkg{ggbio}\par
\only<3>{
\begin{itemize}
\item Comprehensive visualizations
\item \Rfunction{autoplot} file and data types
\item Fully integrated with \Bioconductor{} sequence
representations
\end{itemize}
}
\Biocpkg{epivizr}\par
\only<4>{
\begin{itemize}
\item Genome browser with socket communication to \R{}
\item Fully integrated with \Bioconductor{} sequence
representations
\end{itemize}
}
\column{.5\textwidth}
\only<1>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-1.png}}
\only<2>{\includegraphics[width=\textwidth]{figures/Gviz-vignette-2.png}}
\only<3>{\includegraphics[width=\textwidth]{figures/ggbio-vignette-1.png}}
\only<4>{\includegraphics[width=\textwidth]{figures/epivisr.png}}
\end{columns}
\end{frame}
\section*{Challenges \& Opportunities}
\begin{frame}{Challenges \& Opportunities}
\begin{itemize}
\item Big data -- transparent management within \R, facile use of
established resources
\item Developer and user training
\end{itemize}
Resources
\begin{itemize}
\item \url{http://r-project.org}, \emph{An Introduction to \R}
manual; Dalgaard, \emph{Introductory Statistics with \R};
\href{http://rfordummies.com/}{\R{} for Dummies}
\item \url{https://bioconductor.org/}
\item \url{http://rstudio.org}
\item
\href{http://stackoverflow.com/questions/tagged/r}{StackOverflow},
\Bioconductor{}
\href{https://bioconductor.org/help/mailing-list/mailform/}{mailing
list}
\end{itemize}
\end{frame}
\section*{Acknowledgements}
\begin{frame}{Acknowledgements}
\begin{itemize}
\item \Bioconductor{} team: Marc Carlson, Valerie Obenchain, Herv\'e
Pag\`es, Paul Shannon, Dan Tenenbaum
\item Technical advisory council: Vincent Carey, Wolfgang Huber,
Robert Gentleman, Rafael Irizarry, Sean Davis, Kasper Hansen
\item Scientific advisory board: Simon Tavar\'e, Vivian Bonazzi,
Vincent Carey, Wolfgang Huber, Robert Gentleman, Rafael Irizarry,
Paul Flicek, Simon Urbanek.
\item NIH / NHGRI U41HG004059
\item The \Bioconductor{} community
\item \ldots{} and the organizers of this course!
\end{itemize}
\end{frame}
\end{document}