## ----style-knitr, echo=FALSE, results="asis"------------------------------- BiocStyle::latex(relative.path=TRUE) ## ----results="hide", echo=FALSE-------------------------------------------- knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, cache=TRUE) ## ----results='hide',echo=FALSE--------------------------------------------- library(chipseqDBData) tf.data <- NFYAData() tf.data <- head(tf.data, -1) # skip the input. bam.files <- tf.data$Path cell.type <- sub("NF-YA ([^ ]+) .*", "\\1", tf.data$Description) design <- model.matrix(~factor(cell.type)) colnames(design) <- c("intercept", "cell.type") ## -------------------------------------------------------------------------- library(csaw) param <- readParam(minq=20) data <- windowCounts(bam.files, ext=110, width=10, param=param) ## -------------------------------------------------------------------------- library(edgeR) keep <- aveLogCPM(asDGEList(data)) >= -1 data <- data[keep,] ## -------------------------------------------------------------------------- binned <- windowCounts(bam.files, bin=TRUE, width=10000, param=param) data <- normFactors(binned, se.out=data) ## -------------------------------------------------------------------------- y <- asDGEList(data) y <- estimateDisp(y, design) fit <- glmQLFit(y, design, robust=TRUE) results <- glmQLFTest(fit) ## -------------------------------------------------------------------------- merged <- mergeResults(data, results$table, tol=1000L) ## -------------------------------------------------------------------------- library(chipseqDBData) tf.data <- NFYAData() tf.data bam.files <- head(tf.data$Path, -1) # skip the input. bam.files ## -------------------------------------------------------------------------- frag.len <- 110 win.width <- 10 param <- readParam(minq=20) data <- windowCounts(bam.files, ext=frag.len, width=win.width, param=param) ## -------------------------------------------------------------------------- head(assay(data)) head(rowRanges(data)) data$totals ## -------------------------------------------------------------------------- param ## -------------------------------------------------------------------------- dedup.param <- readParam(minq=20, dedup=TRUE) demo <- windowCounts(bam.files, ext=frag.len, width=win.width, param=dedup.param) demo$totals ## -------------------------------------------------------------------------- restrict.param <- readParam(restrict=c("chr1", "chr10", "chrX")) ## -------------------------------------------------------------------------- repeats <- GRanges("chr1", IRanges(3000001, 3041000)) # telomere discard.param <- readParam(discard=repeats) ## -------------------------------------------------------------------------- another.param <- reform(param, minq=20, discard=repeats) another.param ## -------------------------------------------------------------------------- demo <- windowCounts(bam.files, spacing=100, ext=frag.len, width=win.width, param=param) head(rowRanges(demo)) ## -------------------------------------------------------------------------- demo <- windowCounts(bam.files, ext=frag.len, width=win.width, filter=30, param=param) head(assay(demo)) ## -------------------------------------------------------------------------- demo <- windowCounts(bam.files, width=1000, bin=TRUE, param=param) head(rowRanges(demo)) ## -------------------------------------------------------------------------- # Using the BAM file in Rsamtools as an example. pe.bam <- system.file("extdata", "ex1.bam", package="Rsamtools", mustWork=TRUE) pe.param <- readParam(max.frag=400, pe="both") demo <- windowCounts(pe.bam, ext=250, param=pe.param) demo$totals ## ----frag, fig.small=TRUE-------------------------------------------------- out <- getPESizes(pe.bam) frag.sizes <- out$sizes[out$sizes<=800] hist(frag.sizes, breaks=50, xlab="Fragment sizes (bp)", ylab="Frequency", main="", col="grey80") abline(v=400, col="red") ## -------------------------------------------------------------------------- c(out$diagnostics, too.large=sum(out$sizes > 400)) ## -------------------------------------------------------------------------- first.param <- readParam(pe="first") demo <- windowCounts(pe.bam, param=first.param) demo$totals ## ----ccf, fig.small=TRUE--------------------------------------------------- max.delay <- 500 dedup.on <- reform(param, dedup=TRUE) x <- correlateReads(bam.files, max.delay, param=dedup.on) plot(0:max.delay, x, type="l", ylab="CCF", xlab="Delay (bp)") ## -------------------------------------------------------------------------- maximizeCcf(x) ## ----histone_ccf, fig.asp=1------------------------------------------------ n <- 1000 # Using more data sets from 'chipseqDBData'. acdata <- H3K9acData() h3k9ac <- correlateReads(acdata$Path[1], n, param=dedup.on) k27data <- H3K27me3Data() h3k27me3 <- correlateReads(k27data$Path[1], n, param=dedup.on) k4data <- H3K4me3Data() h3k4me3 <- correlateReads(k4data$Path[1], n, param=dedup.on) plot(0:n, h3k9ac, col="blue", ylim=c(0, 0.1), xlim=c(0, 1000), xlab="Delay (bp)", ylab="CCF", pch=16, type="l", lwd=2) lines(0:n, h3k27me3, col="red", pch=16, lwd=2) lines(0:n, h3k4me3, col="forestgreen", pch=16, lwd=2) legend("topright", col=c("blue", "red", "forestgreen"), c("H3K9ac", "H3K27me3", "H3K4me3"), pch=16) ## -------------------------------------------------------------------------- multi.frag.lens <- list(c(100, 150, 200, 250), 200) demo <- windowCounts(bam.files, ext=multi.frag.lens, filter=30, param=param) demo$ext metadata(demo)$final ## -------------------------------------------------------------------------- my.regions <- GRanges(c("chr11", "chr12", "chr15"), IRanges(c(75461351, 95943801, 21656501), c(75461610, 95944810, 21657610))) reg.counts <- regionCounts(bam.files, my.regions, ext=frag.len, param=param) head(assay(reg.counts)) ## -------------------------------------------------------------------------- ss.param <- reform(param, forward=NULL) ss.counts <- strandedCounts(bam.files, ext=frag.len, width=win.width, param=ss.param) strand(rowRanges(ss.counts)) ## ----results='hide',echo=FALSE--------------------------------------------- rm(demo, ss.counts) gc() ## -------------------------------------------------------------------------- abundances <- aveLogCPM(asDGEList(data)) summary(abundances) ## -------------------------------------------------------------------------- keep.simple <- abundances > -1 filtered.data <- data[keep.simple,] summary(keep.simple) ## -------------------------------------------------------------------------- keep <- abundances > aveLogCPM(5, lib.size=mean(data$totals)) summary(keep) ## -------------------------------------------------------------------------- keep <- filterWindowsProportion(data)$filter > 0.999 sum(keep) ## -------------------------------------------------------------------------- bin.size <- 2000L binned <- windowCounts(bam.files, bin=TRUE, width=bin.size, param=param) ## -------------------------------------------------------------------------- filter.stat <- filterWindowsGlobal(data, background=binned) keep <- filter.stat$filter > log2(3) sum(keep) ## ----binhist--------------------------------------------------------------- hist(filter.stat$back.abundances, xlab="Adjusted bin log-CPM", breaks=100, main="", col="grey80", xlim=c(min(filter.stat$back.abundances), 0)) global.bg <- filter.stat$abundances - filter.stat$filter abline(v=global.bg[1], col="red", lwd=2) abline(v=global.bg[1]+log2(3), col="blue", lwd=2) legend("topright", lwd=2, col=c('red', 'blue'), legend=c("Background", "Threshold")) ## -------------------------------------------------------------------------- surrounds <- 2000 neighbor <- suppressWarnings(resize(rowRanges(data), surrounds, fix="center")) wider <- regionCounts(bam.files, regions=neighbor, ext=frag.len, param=param) ## -------------------------------------------------------------------------- filter.stat <- filterWindowsLocal(data, wider) summary(filter.stat$filter) ## -------------------------------------------------------------------------- keep <- filter.stat$filter > log2(3) sum(keep) ## -------------------------------------------------------------------------- maxed <- findMaxima(rowRanges(data), range=1000, metric=abundances) summary(maxed) ## -------------------------------------------------------------------------- tf.data <- NFYAData() with.input <- tf.data$Path in.demo <- windowCounts(with.input, ext=frag.len, param=param) chip <- in.demo[,1:4] # All ChIP libraries control <- in.demo[,5] # All control libraries ## -------------------------------------------------------------------------- in.binned <- windowCounts(with.input, bin=TRUE, width=10000, param=param) chip.binned <- in.binned[,1:4] control.binned <- in.binned[,5] scale.info <- scaleControlFilter(chip.binned, control.binned) ## -------------------------------------------------------------------------- filter.stat <- filterWindowsControl(chip, control, prior.count=5, scale.info=scale.info) ## -------------------------------------------------------------------------- keep <- filter.stat$filter > log2(3) sum(keep) ## -------------------------------------------------------------------------- library(TxDb.Mmusculus.UCSC.mm10.knownGene) broads <- genes(TxDb.Mmusculus.UCSC.mm10.knownGene) broads <- resize(broads, width(broads)+3000, fix="end") head(broads) ## -------------------------------------------------------------------------- suppressWarnings(keep <- overlapsAny(rowRanges(data), broads)) sum(keep) ## ----results='hide',echo=FALSE--------------------------------------------- rm(binned, in.demo, wider, neighbor, chip, control, filter.stat, in.binned, chip.binned, control.binned) gc() ## -------------------------------------------------------------------------- binned <- windowCounts(bam.files, bin=TRUE, width=10000, param=param) filtered.data <- normFactors(binned, se.out=filtered.data) filtered.data$norm.factors ## -------------------------------------------------------------------------- demo <- windowCounts(bam.files, bin=TRUE, width=5000, param=param) normFactors(demo, se.out=FALSE) demo <- windowCounts(bam.files, bin=TRUE, width=15000, param=param) normFactors(demo, se.out=FALSE) ## ----normplot, fig.wide=TRUE, fig.asp=0.4---------------------------------- par(mfrow=c(1, 3), mar=c(5, 4, 2, 1.5)) adj.counts <- cpm(asDGEList(binned), log=TRUE) normfacs <- filtered.data$norm.factors for (i in seq_len(length(bam.files)-1)) { cur.x <- adj.counts[,1] cur.y <- adj.counts[,1+i] smoothScatter(x=(cur.x+cur.y)/2+6*log2(10), y=cur.x-cur.y, xlab="A", ylab="M", main=paste("1 vs", i+1)) all.dist <- diff(log2(normfacs[c(i+1, 1)])) abline(h=all.dist, col="red") } ## -------------------------------------------------------------------------- k4data <- H3K4me3Data() k4data me.files <- k4data$Path[c(1,3)] me.demo <- windowCounts(me.files, width=150, param=param) ## -------------------------------------------------------------------------- me.bin <- windowCounts(me.files, bin=TRUE, width=10000, param=param) keep <- filterWindowsGlobal(me.demo, me.bin)$filter > log2(3) filtered.me <- me.demo[keep,] ## -------------------------------------------------------------------------- filtered.me <- normFactors(filtered.me, se.out=TRUE) me.eff <- filtered.me$norm.factors me.eff ## -------------------------------------------------------------------------- acdata <- H3K9acData() acdata ac.files <- acdata$Path[c(1,2)] ac.demo <- windowCounts(ac.files, width=150, param=param) ac.bin <- windowCounts(ac.files, bin=TRUE, width=10000, param=param) keep <- filterWindowsGlobal(ac.demo, ac.bin)$filter > log2(5) summary(keep) filtered.ac <- ac.demo[keep,] filtered.ac <- normFactors(filtered.ac, se.out=TRUE) ac.eff <- filtered.ac$norm.factors ac.eff ## -------------------------------------------------------------------------- me.comp <- normFactors(me.bin, se.out=FALSE) me.comp ac.comp <- normFactors(ac.bin, se.out=FALSE) ac.comp ## ----methplot, fig.wide=TRUE, fig.asp=0.5---------------------------------- par(mfrow=c(1,2)) for (main in c("H3K4me3", "H3ac")) { if (main=="H3K4me3") { bins <- me.bin comp <- me.comp eff <- me.eff } else { bins <- ac.bin comp <- ac.comp eff <- ac.eff } adjc <- cpm(asDGEList(bins), log=TRUE) smoothScatter(x=rowMeans(adjc), y=adjc[,1]-adjc[,2], xlab="A", ylab="M", main=main) abline(h=log2(eff[1]/eff[2]), col="red") abline(h=log2(comp[1]/comp[2]), col="red", lty=2) } ## -------------------------------------------------------------------------- # Pretend chr1 is a spike-in, for demonstration purposes only! is.1 <- seqnames(rowRanges(data))=="chr1" spike.data <- data[is.1,] endog.data <- data[!is.1,] endog.data <- normFactors(spike.data, se.out=endog.data) ## -------------------------------------------------------------------------- ac.demo2 <- windowCounts(ac.files, width=2000L, param=param) filtered <- filterWindowsGlobal(ac.demo2, ac.bin) keep <- filtered$filter > log2(4) ac.demo2 <- ac.demo2[keep,] ac.demo2 <- normOffsets(ac.demo2, se.out=TRUE) ac.off <- assay(ac.demo2, "offset") head(ac.off) ## ----trendplot, fig.wide=TRUE, fig.asp=0.5--------------------------------- par(mfrow=c(1,2)) # MA plot without normalization. ac.y <- asDGEList(ac.demo2) adjc <- cpm(ac.y, log=TRUE) abval <- aveLogCPM(ac.y) mval <- adjc[,1]-adjc[,2] fit <- loessFit(x=abval, y=mval) smoothScatter(abval, mval, ylab="M", xlab="Average logCPM", main="Raw", ylim=c(-2,2), xlim=c(0, 7)) o <- order(abval) lines(abval[o], fit$fitted[o], col="red") # Repeating after normalization. re.adjc <- log2(assay(ac.demo2)+0.5) - ac.off/log(2) mval <- re.adjc[,1]-re.adjc[,2] fit <- loessFit(x=abval, y=mval) smoothScatter(abval, re.adjc[,1]-re.adjc[,2], ylab="M", xlab="Average logCPM", main="Normalized", ylim=c(-2,2), xlim=c(0, 7)) lines(abval[o], fit$fitted[o], col="red") ## ----results='hide',echo=FALSE--------------------------------------------- rm(demo, ac.demo, filtered.ac, ac.bin, me.demo, filtered.me, me.bin, ac.demo2, filtered, keep, adjc) gc() ## -------------------------------------------------------------------------- y <- asDGEList(filtered.data) ## -------------------------------------------------------------------------- tf.data <- NFYAData() cell.type <- sub("NF-YA ([^ ]+) .*", "\\1", head(tf.data$Description, -1)) cell.type design <- model.matrix(~factor(cell.type)) colnames(design) <- c("intercept", "cell.type") design ## -------------------------------------------------------------------------- y <- estimateDisp(y, design) summary(y$trended.dispersion) fit <- glmQLFit(y, design, robust=TRUE) summary(fit$var.post) ## ----dispplot, fig.wide=TRUE, fig.asp=0.5---------------------------------- par(mfrow=c(1,2)) o <- order(y$AveLogCPM) plot(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), type="l", lwd=2, ylim=c(0, 1), xlab=expression("Ave."~Log[2]~"CPM"), ylab=("Biological coefficient of variation")) plotQLDisp(fit) ## ----rmchecker, fig.small=TRUE--------------------------------------------- relevant <- rowSums(assay(data)) >= 20 # weaker filtering than 'filtered.data' yo <- asDGEList(data[relevant], norm.factors=normfacs) yo <- estimateDisp(yo, design) oo <- order(yo$AveLogCPM) plot(yo$AveLogCPM[oo], sqrt(yo$trended.dispersion[oo]), type="l", lwd=2, ylim=c(0, max(sqrt(yo$trended))), xlab=expression("Ave."~Log[2]~"CPM"), ylab=("Biological coefficient of variation")) lines(y$AveLogCPM[o], sqrt(y$trended[o]), lwd=2, col="grey") legend("topright", c("raw", "filtered"), col=c("black", "grey"), lwd=2) ## -------------------------------------------------------------------------- summary(fit$df.prior) ## -------------------------------------------------------------------------- results <- glmQLFTest(fit, contrast=c(0, 1)) head(results$table) ## -------------------------------------------------------------------------- rowData(filtered.data) <- cbind(rowData(filtered.data), results$table) ## -------------------------------------------------------------------------- fit.norep <- glmFit(y, design, dispersion=0.05) results.norep <- glmLRT(fit.norep, contrast=c(0, 1)) head(results.norep$table) ## ----mds, fig.asp=1-------------------------------------------------------- par(mfrow=c(2,2), mar=c(5,4,2,2)) adj.counts <- cpm(y, log=TRUE) for (top in c(100, 500, 1000, 5000)) { out <- plotMDS(adj.counts, main=top, col=c("blue", "blue", "red", "red"), labels=c("es.1", "es.2", "tn.1", "tn.2"), top=top) } ## ----echo=FALSE,results='hide'--------------------------------------------- rm(adj.counts) gc() ## -------------------------------------------------------------------------- merged <- mergeWindows(filtered.data, tol=1000L) merged$regions ## -------------------------------------------------------------------------- summary(width(merged$regions)) ## -------------------------------------------------------------------------- merged.max <- mergeWindows(filtered.data, tol=1000L, max.width=5000L) summary(width(merged.max$regions)) ## -------------------------------------------------------------------------- olap <- findOverlaps(broads, rowRanges(filtered.data)) olap ## -------------------------------------------------------------------------- tabcom <- combineTests(merged$ids, results$table) is.sig.region <- tabcom$FDR <= 0.05 summary(is.sig.region) ## -------------------------------------------------------------------------- table(tabcom$direction[is.sig.region]) ## -------------------------------------------------------------------------- tabbroad <- combineOverlaps(olap, results$table) head(tabbroad[!is.na(tabbroad$PValue),]) is.sig.gene <- tabcom$FDR <= 0.05 table(tabbroad$direction[is.sig.gene]) ## -------------------------------------------------------------------------- tab.best <- getBestTest(merged$ids, results$table) head(tab.best) ## -------------------------------------------------------------------------- tabcom$best.logFC <- tab.best$logFC tabcom$best.start <- start(rowRanges(filtered.data))[tab.best$best] head(tabcom[,c("best.logFC", "best.start")]) ## -------------------------------------------------------------------------- tab.best.broad <- getBestOverlaps(olap, results$table) tabbroad$best.logFC <- tab.best.broad$logFC tabbroad$best.start <- start(rowRanges(filtered.data))[tab.best.broad$best] head(tabbroad[!is.na(tabbroad$PValue),c("best.logFC", "best.start")]) ## -------------------------------------------------------------------------- merge.res <- mergeResults(filtered.data, results$table, tol=100, merge.args=list(max.width=5000)) names(merge.res) ## -------------------------------------------------------------------------- broad.res <- overlapResults(filtered.data, regions=broads, tab=results$table) names(broad.res) ## -------------------------------------------------------------------------- ac.small <- windowCounts(ac.files, width=150L, spacing=100L, filter=25, param=param) ac.large <- windowCounts(ac.files, width=1000L, spacing=500L, filter=35, param=param) # Mocking up results for demonstration purposes. ns <- nrow(ac.small) mock.small <- data.frame(logFC=rnorm(ns), logCPM=0, PValue=runif(ns)) nl <- nrow(ac.large) mock.large <- data.frame(logFC=rnorm(nl), logCPM=0, PValue=runif(nl)) ## -------------------------------------------------------------------------- cons.res <- mergeResultsList(list(ac.small, ac.large), tab.list=list(mock.small, mock.large), equiweight=TRUE, tol=1000) cons.res$regions cons.res$combined ## -------------------------------------------------------------------------- cons.broad <- overlapResultsList(list(ac.small, ac.large), tab.list=list(mock.small, mock.large), equiweight=TRUE, region=broads) cons.broad$regions cons.res$combined ## -------------------------------------------------------------------------- tab.ave <- getBestTest(merged$id, results$table, by.pval=FALSE) weights <- upweightSummit(merged$id, tab.ave$best) head(weights) tabcom.w <- combineTests(merged$id, results$table, weight=weights) head(tabcom.w) ## -------------------------------------------------------------------------- broad.best <- getBestOverlaps(olap, results$table, by.pval=FALSE) head(broad.best[!is.na(broad.best$PValue),]) broad.weights <- summitOverlaps(olap, region.best=broad.best$best) tabbroad.w <- combineOverlaps(olap, results$table, o.weight=broad.weights) ## -------------------------------------------------------------------------- postclust <- clusterWindows(rowRanges(filtered.data), results$table, target=0.05, tol=100, max.width=1000) postclust$FDR postclust$region ## -------------------------------------------------------------------------- empres <- empiricalFDR(merged$id, results$table) ## -------------------------------------------------------------------------- tab.mixed <- mixedClusters(merged$ids, results$table) tab.mixed ## -------------------------------------------------------------------------- mcols(broads) <- tabbroad mcols(merged$region) <- tabcom ## ----echo=FALSE,results='hide'--------------------------------------------- rm(ac.small, ac.large) gc() ## -------------------------------------------------------------------------- library(org.Mm.eg.db) anno <- detailRanges(merged$region, txdb=TxDb.Mmusculus.UCSC.mm10.knownGene, orgdb=org.Mm.eg.db, promoter=c(3000, 1000), dist=5000) head(anno$overlap) head(anno$left) head(anno$right) ## -------------------------------------------------------------------------- merged$region$overlap <- anno$overlap merged$region$left <- anno$left merged$region$right <- anno$right ## -------------------------------------------------------------------------- anno.ranges <- detailRanges(txdb=TxDb.Mmusculus.UCSC.mm10.knownGene, orgdb=org.Mm.eg.db) anno.ranges ## -------------------------------------------------------------------------- spacing <- metadata(data)$spacing expanded <- resize(merged$region, fix="center", width=width(merged$region)+spacing) sbm.score <- checkBimodality(bam.files, expanded, width=frag.len) head(sbm.score) ## -------------------------------------------------------------------------- ofile <- gzfile("clusters.tsv.gz", open="w") write.table(as.data.frame(merged$region), file=ofile, row.names=FALSE, quote=FALSE, sep="\t") close(ofile) ## -------------------------------------------------------------------------- is.sig <- merged$region$FDR <= 0.05 library(rtracklayer) test <- merged$region[is.sig] test$score <- -10*log10(merged$region$FDR[is.sig]) names(test) <- paste0("region", 1:sum(is.sig)) export(test, "clusters.bed") head(read.table("clusters.bed")) ## -------------------------------------------------------------------------- saveRDS(merged$region, "ranges.rds") ## -------------------------------------------------------------------------- cur.region <- GRanges("chr18", IRanges(77806807, 77807165)) extractReads(bam.files[1], cur.region, param=param) ## ----regionplot, fig.asp=1------------------------------------------------- library(Gviz) collected <- vector("list", length(bam.files)) for (i in seq_along(bam.files)) { reads <- extractReads(bam.files[i], cur.region, param=param) adj.total <- data$totals[i]/1e6 pcov <- as(coverage(reads[strand(reads)=="+"])/adj.total, "GRanges") ncov <- as(coverage(reads[strand(reads)=="-"])/adj.total, "GRanges") ptrack <- DataTrack(pcov, type="histogram", lwd=0, fill=rgb(0,0,1,.4), ylim=c(0,1.1), name=bam.files[i], col.axis="black", col.title="black") ntrack <- DataTrack(ncov, type="histogram", lwd=0, fill=rgb(1,0,0,.4), ylim=c(0,1.1)) collected[[i]] <- OverlayTrack(trackList=list(ptrack,ntrack)) } gax <- GenomeAxisTrack(col="black") plotTracks(c(gax, collected), from=start(cur.region), to=end(cur.region)) ## ----echo=FALSE,results='hide'--------------------------------------------- rm(anno) gc() ## -------------------------------------------------------------------------- sessionInfo()