## ----style-knitr, echo=FALSE, results="asis"-------------------------------
BiocStyle::latex(relative.path=TRUE)

## ----results="hide", echo=FALSE--------------------------------------------
knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, cache=TRUE)

## ----results='hide',echo=FALSE---------------------------------------------
library(chipseqDBData)
tf.data <- NFYAData()
tf.data <- head(tf.data, -1) # skip the input.
bam.files <- tf.data$Path

cell.type <- sub("NF-YA ([^ ]+) .*", "\\1", tf.data$Description)
design <- model.matrix(~factor(cell.type))
colnames(design) <- c("intercept", "cell.type")

## --------------------------------------------------------------------------
library(csaw)
param <- readParam(minq=20)
data <- windowCounts(bam.files, ext=110, width=10, param=param)

## --------------------------------------------------------------------------
library(edgeR)
keep <- aveLogCPM(asDGEList(data)) >= -1
data <- data[keep,]

## --------------------------------------------------------------------------
binned <- windowCounts(bam.files, bin=TRUE, width=10000, param=param)
data <- normFactors(binned, se.out=data)

## --------------------------------------------------------------------------
y <- asDGEList(data)
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design, robust=TRUE)
results <- glmQLFTest(fit)

## --------------------------------------------------------------------------
merged <- mergeResults(data, results$table, tol=1000L)

## --------------------------------------------------------------------------
library(chipseqDBData)
tf.data <- NFYAData()
tf.data
bam.files <- head(tf.data$Path, -1) # skip the input.
bam.files

## --------------------------------------------------------------------------
frag.len <- 110
win.width <- 10
param <- readParam(minq=20)
data <- windowCounts(bam.files, ext=frag.len, width=win.width, param=param)

## --------------------------------------------------------------------------
head(assay(data))
head(rowRanges(data))
data$totals

## --------------------------------------------------------------------------
param

## --------------------------------------------------------------------------
dedup.param <- readParam(minq=20, dedup=TRUE)
demo <- windowCounts(bam.files, ext=frag.len, width=win.width, 
                     param=dedup.param)
demo$totals

## --------------------------------------------------------------------------
restrict.param <- readParam(restrict=c("chr1", "chr10", "chrX"))

## --------------------------------------------------------------------------
repeats <- GRanges("chr1", IRanges(3000001, 3041000)) # telomere
discard.param <- readParam(discard=repeats)

## --------------------------------------------------------------------------
another.param <- reform(param, minq=20, discard=repeats)
another.param

## --------------------------------------------------------------------------
demo <- windowCounts(bam.files, spacing=100, ext=frag.len, 
                     width=win.width, param=param)
head(rowRanges(demo))

## --------------------------------------------------------------------------
demo <- windowCounts(bam.files, ext=frag.len, width=win.width, 
                     filter=30, param=param)
head(assay(demo))

## --------------------------------------------------------------------------
demo <- windowCounts(bam.files, width=1000, bin=TRUE, param=param)
head(rowRanges(demo))

## --------------------------------------------------------------------------
# Using the BAM file in Rsamtools as an example.
pe.bam <- system.file("extdata", "ex1.bam", package="Rsamtools", 
    mustWork=TRUE)

pe.param <- readParam(max.frag=400, pe="both")
demo <- windowCounts(pe.bam, ext=250, param=pe.param)
demo$totals

## ----frag, fig.small=TRUE--------------------------------------------------
out <- getPESizes(pe.bam)
frag.sizes <- out$sizes[out$sizes<=800]
hist(frag.sizes, breaks=50, xlab="Fragment sizes (bp)", 
     ylab="Frequency", main="", col="grey80")
abline(v=400, col="red")

## --------------------------------------------------------------------------
c(out$diagnostics, too.large=sum(out$sizes > 400))

## --------------------------------------------------------------------------
first.param <- readParam(pe="first")
demo <- windowCounts(pe.bam, param=first.param)
demo$totals

## ----ccf, fig.small=TRUE---------------------------------------------------
max.delay <- 500
dedup.on <- reform(param, dedup=TRUE)
x <- correlateReads(bam.files, max.delay, param=dedup.on)
plot(0:max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)")

## --------------------------------------------------------------------------
maximizeCcf(x)

## ----histone_ccf, fig.asp=1------------------------------------------------
n <- 1000

# Using more data sets from 'chipseqDBData'.
acdata <- H3K9acData()
h3k9ac <- correlateReads(acdata$Path[1], n, param=dedup.on)

k27data <- H3K27me3Data()
h3k27me3 <- correlateReads(k27data$Path[1], n, param=dedup.on)

k4data <- H3K4me3Data()
h3k4me3 <- correlateReads(k4data$Path[1], n, param=dedup.on)

plot(0:n, h3k9ac, col="blue", ylim=c(0, 0.1), xlim=c(0, 1000),
     xlab="Delay (bp)", ylab="CCF", pch=16, type="l", lwd=2)
lines(0:n, h3k27me3, col="red", pch=16, lwd=2)
lines(0:n, h3k4me3, col="forestgreen", pch=16, lwd=2)
legend("topright", col=c("blue", "red", "forestgreen"),
       c("H3K9ac", "H3K27me3", "H3K4me3"), pch=16)

## --------------------------------------------------------------------------
multi.frag.lens <- list(c(100, 150, 200, 250), 200)
demo <- windowCounts(bam.files, ext=multi.frag.lens, filter=30, param=param)
demo$ext
metadata(demo)$final

## --------------------------------------------------------------------------
my.regions <- GRanges(c("chr11", "chr12", "chr15"),
                      IRanges(c(75461351, 95943801, 21656501), 
                      c(75461610, 95944810, 21657610)))
reg.counts <- regionCounts(bam.files, my.regions, ext=frag.len, param=param)
head(assay(reg.counts))

## --------------------------------------------------------------------------
ss.param <- reform(param, forward=NULL)
ss.counts <- strandedCounts(bam.files, ext=frag.len, width=win.width, 
                            param=ss.param)
strand(rowRanges(ss.counts))

## ----results='hide',echo=FALSE---------------------------------------------
rm(demo, ss.counts)
gc()

## --------------------------------------------------------------------------
abundances <- aveLogCPM(asDGEList(data))
summary(abundances)

## --------------------------------------------------------------------------
keep.simple <- abundances > -1
filtered.data <- data[keep.simple,]
summary(keep.simple)

## --------------------------------------------------------------------------
keep <- abundances > aveLogCPM(5, lib.size=mean(data$totals))
summary(keep)

## --------------------------------------------------------------------------
keep <- filterWindowsProportion(data)$filter > 0.999
sum(keep)

## --------------------------------------------------------------------------
bin.size <- 2000L
binned <- windowCounts(bam.files, bin=TRUE, width=bin.size, param=param)

## --------------------------------------------------------------------------
filter.stat <- filterWindowsGlobal(data, background=binned)
keep <- filter.stat$filter > log2(3)
sum(keep)

## ----binhist---------------------------------------------------------------
hist(filter.stat$back.abundances, xlab="Adjusted bin log-CPM", breaks=100, 
     main="", col="grey80", xlim=c(min(filter.stat$back.abundances), 0))
global.bg <- filter.stat$abundances - filter.stat$filter
abline(v=global.bg[1], col="red", lwd=2)
abline(v=global.bg[1]+log2(3), col="blue", lwd=2)
legend("topright", lwd=2, col=c('red', 'blue'), 
       legend=c("Background", "Threshold"))

## --------------------------------------------------------------------------
surrounds <- 2000
neighbor <- suppressWarnings(resize(rowRanges(data), surrounds, fix="center"))
wider <- regionCounts(bam.files, regions=neighbor, ext=frag.len, param=param)

## --------------------------------------------------------------------------
filter.stat <- filterWindowsLocal(data, wider)
summary(filter.stat$filter)

## --------------------------------------------------------------------------
keep <- filter.stat$filter > log2(3)
sum(keep)

## --------------------------------------------------------------------------
maxed <- findMaxima(rowRanges(data), range=1000, metric=abundances)
summary(maxed)

## --------------------------------------------------------------------------
tf.data <- NFYAData()
with.input <- tf.data$Path
in.demo <- windowCounts(with.input, ext=frag.len, param=param)
chip <- in.demo[,1:4] # All ChIP libraries
control <- in.demo[,5] # All control libraries

## --------------------------------------------------------------------------
in.binned <- windowCounts(with.input, bin=TRUE, width=10000, param=param)
chip.binned <- in.binned[,1:4]
control.binned <- in.binned[,5]
scale.info <- scaleControlFilter(chip.binned, control.binned)

## --------------------------------------------------------------------------
filter.stat <- filterWindowsControl(chip, control, 
    prior.count=5, scale.info=scale.info)

## --------------------------------------------------------------------------
keep <- filter.stat$filter > log2(3)
sum(keep)

## --------------------------------------------------------------------------
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
broads <- genes(TxDb.Mmusculus.UCSC.mm10.knownGene)
broads <- resize(broads, width(broads)+3000, fix="end")
head(broads)

## --------------------------------------------------------------------------
suppressWarnings(keep <- overlapsAny(rowRanges(data), broads))
sum(keep)

## ----results='hide',echo=FALSE---------------------------------------------
rm(binned, in.demo, wider, neighbor, chip, control, filter.stat,
	in.binned, chip.binned, control.binned)
gc()

## --------------------------------------------------------------------------
binned <- windowCounts(bam.files, bin=TRUE, width=10000, param=param)
filtered.data <- normFactors(binned, se.out=filtered.data)
filtered.data$norm.factors

## --------------------------------------------------------------------------
demo <- windowCounts(bam.files, bin=TRUE, width=5000, param=param)
normFactors(demo, se.out=FALSE)
demo <- windowCounts(bam.files, bin=TRUE, width=15000, param=param)
normFactors(demo, se.out=FALSE)

## ----normplot, fig.wide=TRUE, fig.asp=0.4----------------------------------
par(mfrow=c(1, 3), mar=c(5, 4, 2, 1.5))
adj.counts <- cpm(asDGEList(binned), log=TRUE)
normfacs <- filtered.data$norm.factors
for (i in seq_len(length(bam.files)-1)) {
    cur.x <- adj.counts[,1]
    cur.y <- adj.counts[,1+i]
    smoothScatter(x=(cur.x+cur.y)/2+6*log2(10), y=cur.x-cur.y,
                  xlab="A", ylab="M", main=paste("1 vs", i+1))
    all.dist <- diff(log2(normfacs[c(i+1, 1)]))
    abline(h=all.dist, col="red")
}

## --------------------------------------------------------------------------
k4data <- H3K4me3Data()
k4data
me.files <- k4data$Path[c(1,3)]
me.demo <- windowCounts(me.files, width=150, param=param)

## --------------------------------------------------------------------------
me.bin <- windowCounts(me.files, bin=TRUE, width=10000, param=param) 
keep <- filterWindowsGlobal(me.demo, me.bin)$filter > log2(3)
filtered.me <- me.demo[keep,]

## --------------------------------------------------------------------------
filtered.me <- normFactors(filtered.me, se.out=TRUE)
me.eff <- filtered.me$norm.factors
me.eff

## --------------------------------------------------------------------------
acdata <- H3K9acData()
acdata
ac.files <- acdata$Path[c(1,2)]
ac.demo <- windowCounts(ac.files, width=150, param=param)
ac.bin <- windowCounts(ac.files, bin=TRUE, width=10000, param=param)
keep <- filterWindowsGlobal(ac.demo, ac.bin)$filter > log2(5)
summary(keep)
filtered.ac <- ac.demo[keep,]
filtered.ac <- normFactors(filtered.ac, se.out=TRUE)
ac.eff <- filtered.ac$norm.factors
ac.eff

## --------------------------------------------------------------------------
me.comp <- normFactors(me.bin, se.out=FALSE)
me.comp
ac.comp <- normFactors(ac.bin, se.out=FALSE)
ac.comp

## ----methplot, fig.wide=TRUE, fig.asp=0.5----------------------------------
par(mfrow=c(1,2))
for (main in c("H3K4me3", "H3ac")) { 
    if (main=="H3K4me3") { 
        bins <- me.bin
        comp <- me.comp
        eff <- me.eff 
    } else { 
        bins <- ac.bin
        comp <- ac.comp
        eff <- ac.eff 
    }
    adjc <- cpm(asDGEList(bins), log=TRUE)
    smoothScatter(x=rowMeans(adjc), y=adjc[,1]-adjc[,2], 
                  xlab="A", ylab="M", main=main)
    abline(h=log2(eff[1]/eff[2]), col="red")
    abline(h=log2(comp[1]/comp[2]), col="red", lty=2)
}

## --------------------------------------------------------------------------
# Pretend chr1 is a spike-in, for demonstration purposes only!
is.1 <- seqnames(rowRanges(data))=="chr1"
spike.data <- data[is.1,]
endog.data <- data[!is.1,]

endog.data <- normFactors(spike.data, se.out=endog.data)

## --------------------------------------------------------------------------
ac.demo2 <- windowCounts(ac.files, width=2000L, param=param)
filtered <- filterWindowsGlobal(ac.demo2, ac.bin)
keep <- filtered$filter > log2(4)
ac.demo2 <- ac.demo2[keep,]
ac.demo2 <- normOffsets(ac.demo2, se.out=TRUE)
ac.off <- assay(ac.demo2, "offset")
head(ac.off)

## ----trendplot, fig.wide=TRUE, fig.asp=0.5---------------------------------
par(mfrow=c(1,2))

# MA plot without normalization.
ac.y <- asDGEList(ac.demo2)
adjc <- cpm(ac.y, log=TRUE)
abval <- aveLogCPM(ac.y)
mval <- adjc[,1]-adjc[,2]
fit <- loessFit(x=abval, y=mval)
smoothScatter(abval, mval, ylab="M", xlab="Average logCPM", 
              main="Raw", ylim=c(-2,2), xlim=c(0, 7))
o <- order(abval)
lines(abval[o], fit$fitted[o], col="red")

# Repeating after normalization.
re.adjc <- log2(assay(ac.demo2)+0.5) - ac.off/log(2)
mval <- re.adjc[,1]-re.adjc[,2]
fit <- loessFit(x=abval, y=mval)
smoothScatter(abval, re.adjc[,1]-re.adjc[,2], ylab="M", xlab="Average logCPM", 
              main="Normalized", ylim=c(-2,2), xlim=c(0, 7))
lines(abval[o], fit$fitted[o], col="red")

## ----results='hide',echo=FALSE---------------------------------------------
rm(demo, ac.demo, filtered.ac, ac.bin, me.demo, filtered.me, me.bin, ac.demo2, filtered, keep, adjc)
gc()

## --------------------------------------------------------------------------
y <- asDGEList(filtered.data)

## --------------------------------------------------------------------------
tf.data <- NFYAData()
cell.type <- sub("NF-YA ([^ ]+) .*", "\\1", head(tf.data$Description, -1))
cell.type
design <- model.matrix(~factor(cell.type))
colnames(design) <- c("intercept", "cell.type")
design

## --------------------------------------------------------------------------
y <- estimateDisp(y, design)
summary(y$trended.dispersion)
fit <- glmQLFit(y, design, robust=TRUE)
summary(fit$var.post)

## ----dispplot, fig.wide=TRUE, fig.asp=0.5----------------------------------
par(mfrow=c(1,2))
o <- order(y$AveLogCPM)
plot(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), type="l", lwd=2,
     ylim=c(0, 1), xlab=expression("Ave."~Log[2]~"CPM"),
     ylab=("Biological coefficient of variation"))
plotQLDisp(fit)

## ----rmchecker, fig.small=TRUE---------------------------------------------
relevant <- rowSums(assay(data)) >= 20 # weaker filtering than 'filtered.data'
yo <- asDGEList(data[relevant], norm.factors=normfacs)
yo <- estimateDisp(yo, design)
oo <- order(yo$AveLogCPM)
plot(yo$AveLogCPM[oo], sqrt(yo$trended.dispersion[oo]), type="l", lwd=2,
     ylim=c(0, max(sqrt(yo$trended))), xlab=expression("Ave."~Log[2]~"CPM"), 
     ylab=("Biological coefficient of variation"))
lines(y$AveLogCPM[o], sqrt(y$trended[o]), lwd=2, col="grey")
legend("topright", c("raw", "filtered"), col=c("black", "grey"), lwd=2)

## --------------------------------------------------------------------------
summary(fit$df.prior)

## --------------------------------------------------------------------------
results <- glmQLFTest(fit, contrast=c(0, 1))
head(results$table)

## --------------------------------------------------------------------------
rowData(filtered.data) <- cbind(rowData(filtered.data), results$table)

## --------------------------------------------------------------------------
fit.norep <- glmFit(y, design, dispersion=0.05)
results.norep <- glmLRT(fit.norep, contrast=c(0, 1))
head(results.norep$table)

## ----mds, fig.asp=1--------------------------------------------------------
par(mfrow=c(2,2), mar=c(5,4,2,2))
adj.counts <- cpm(y, log=TRUE)
for (top in c(100, 500, 1000, 5000)) {
    out <- plotMDS(adj.counts, main=top, col=c("blue", "blue", "red", "red"),
                   labels=c("es.1", "es.2", "tn.1", "tn.2"), top=top)
}

## ----echo=FALSE,results='hide'---------------------------------------------
rm(adj.counts)
gc()

## --------------------------------------------------------------------------
merged <- mergeWindows(filtered.data, tol=1000L)
merged$regions

## --------------------------------------------------------------------------
summary(width(merged$regions))

## --------------------------------------------------------------------------
merged.max <- mergeWindows(filtered.data, tol=1000L, max.width=5000L)
summary(width(merged.max$regions))

## --------------------------------------------------------------------------
olap <- findOverlaps(broads, rowRanges(filtered.data))
olap

## --------------------------------------------------------------------------
tabcom <- combineTests(merged$ids, results$table)
is.sig.region <- tabcom$FDR <= 0.05
summary(is.sig.region)

## --------------------------------------------------------------------------
table(tabcom$direction[is.sig.region])

## --------------------------------------------------------------------------
tabbroad <- combineOverlaps(olap, results$table)
head(tabbroad[!is.na(tabbroad$PValue),])
is.sig.gene <- tabcom$FDR <= 0.05
table(tabbroad$direction[is.sig.gene])

## --------------------------------------------------------------------------
tab.best <- getBestTest(merged$ids, results$table)
head(tab.best)

## --------------------------------------------------------------------------
tabcom$best.logFC <- tab.best$logFC
tabcom$best.start <- start(rowRanges(filtered.data))[tab.best$best]
head(tabcom[,c("best.logFC", "best.start")])

## --------------------------------------------------------------------------
tab.best.broad <- getBestOverlaps(olap, results$table)
tabbroad$best.logFC <- tab.best.broad$logFC
tabbroad$best.start <- start(rowRanges(filtered.data))[tab.best.broad$best]
head(tabbroad[!is.na(tabbroad$PValue),c("best.logFC", "best.start")])

## --------------------------------------------------------------------------
merge.res <- mergeResults(filtered.data, results$table, tol=100,
    merge.args=list(max.width=5000))
names(merge.res)

## --------------------------------------------------------------------------
broad.res <- overlapResults(filtered.data, regions=broads,
    tab=results$table)
names(broad.res)

## --------------------------------------------------------------------------
ac.small <- windowCounts(ac.files, width=150L, spacing=100L, 
    filter=25, param=param)
ac.large <- windowCounts(ac.files, width=1000L, spacing=500L, 
    filter=35, param=param)

# Mocking up results for demonstration purposes.
ns <- nrow(ac.small)
mock.small <- data.frame(logFC=rnorm(ns), logCPM=0, PValue=runif(ns)) 
nl <- nrow(ac.large)
mock.large <- data.frame(logFC=rnorm(nl), logCPM=0, PValue=runif(nl)) 

## --------------------------------------------------------------------------
cons.res <- mergeResultsList(list(ac.small, ac.large), 
    tab.list=list(mock.small, mock.large), 
    equiweight=TRUE, tol=1000)
cons.res$regions
cons.res$combined

## --------------------------------------------------------------------------
cons.broad <- overlapResultsList(list(ac.small, ac.large),
    tab.list=list(mock.small, mock.large), 
    equiweight=TRUE, region=broads)
cons.broad$regions
cons.res$combined

## --------------------------------------------------------------------------
tab.ave <- getBestTest(merged$id, results$table, by.pval=FALSE)
weights <- upweightSummit(merged$id, tab.ave$best)
head(weights)
tabcom.w <- combineTests(merged$id, results$table, weight=weights)
head(tabcom.w)

## --------------------------------------------------------------------------
broad.best <- getBestOverlaps(olap, results$table, by.pval=FALSE)
head(broad.best[!is.na(broad.best$PValue),])
broad.weights <- summitOverlaps(olap, region.best=broad.best$best)
tabbroad.w <- combineOverlaps(olap, results$table, o.weight=broad.weights) 

## --------------------------------------------------------------------------
postclust <- clusterWindows(rowRanges(filtered.data), results$table,
                            target=0.05, tol=100, max.width=1000)
postclust$FDR
postclust$region

## --------------------------------------------------------------------------
empres <- empiricalFDR(merged$id, results$table)

## --------------------------------------------------------------------------
tab.mixed <- mixedClusters(merged$ids, results$table)
tab.mixed

## --------------------------------------------------------------------------
mcols(broads) <- tabbroad 
mcols(merged$region) <- tabcom

## ----echo=FALSE,results='hide'---------------------------------------------
rm(ac.small, ac.large)
gc()

## --------------------------------------------------------------------------
library(org.Mm.eg.db)
anno <- detailRanges(merged$region, txdb=TxDb.Mmusculus.UCSC.mm10.knownGene,
                     orgdb=org.Mm.eg.db, promoter=c(3000, 1000), dist=5000)
head(anno$overlap)
head(anno$left)
head(anno$right)

## --------------------------------------------------------------------------
merged$region$overlap <- anno$overlap
merged$region$left <- anno$left
merged$region$right <- anno$right

## --------------------------------------------------------------------------
anno.ranges <- detailRanges(txdb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                            orgdb=org.Mm.eg.db)
anno.ranges

## --------------------------------------------------------------------------
spacing <- metadata(data)$spacing
expanded <- resize(merged$region, fix="center", 
                   width=width(merged$region)+spacing)
sbm.score <- checkBimodality(bam.files, expanded, width=frag.len)
head(sbm.score)

## --------------------------------------------------------------------------
ofile <- gzfile("clusters.tsv.gz", open="w")
write.table(as.data.frame(merged$region), file=ofile, 
    row.names=FALSE, quote=FALSE, sep="\t")
close(ofile)

## --------------------------------------------------------------------------
is.sig <- merged$region$FDR <= 0.05
library(rtracklayer)
test <- merged$region[is.sig]
test$score <- -10*log10(merged$region$FDR[is.sig])
names(test) <- paste0("region", 1:sum(is.sig))
export(test, "clusters.bed")
head(read.table("clusters.bed"))

## --------------------------------------------------------------------------
saveRDS(merged$region, "ranges.rds")

## --------------------------------------------------------------------------
cur.region <- GRanges("chr18", IRanges(77806807, 77807165))
extractReads(bam.files[1], cur.region, param=param)

## ----regionplot, fig.asp=1-------------------------------------------------
library(Gviz)
collected <- vector("list", length(bam.files))
for (i in seq_along(bam.files)) { 
    reads <- extractReads(bam.files[i], cur.region, param=param)
    adj.total <- data$totals[i]/1e6
    pcov <- as(coverage(reads[strand(reads)=="+"])/adj.total, "GRanges")
    ncov <- as(coverage(reads[strand(reads)=="-"])/adj.total, "GRanges")
    ptrack <- DataTrack(pcov, type="histogram", lwd=0, fill=rgb(0,0,1,.4), 
                        ylim=c(0,1.1), name=bam.files[i], col.axis="black", 
                        col.title="black")
    ntrack <- DataTrack(ncov, type="histogram", lwd=0, fill=rgb(1,0,0,.4), 
                        ylim=c(0,1.1))
    collected[[i]] <- OverlayTrack(trackList=list(ptrack,ntrack))
}
gax <- GenomeAxisTrack(col="black")
plotTracks(c(gax, collected), from=start(cur.region), to=end(cur.region))

## ----echo=FALSE,results='hide'---------------------------------------------
rm(anno)
gc()

## --------------------------------------------------------------------------
sessionInfo()