## ---- echo=FALSE, results="hide", warning=FALSE------------------------------- suppressPackageStartupMessages({ library(ChIPpeakAnno) library(org.Hs.eg.db) library(GenomicFeatures) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(rtracklayer) library(GO.db) library(EnsDb.Hsapiens.v75) library(BSgenome.Ecoli.NCBI.20080805) }) knitr::opts_chunk$set(warning=FALSE, message=FALSE) ## ----quickStart--------------------------------------------------------------- library(ChIPpeakAnno) ## import the MACS output macs <- system.file("extdata", "MACS_peaks.xls", package="ChIPpeakAnno") macsOutput <- toGRanges(macs, format="MACS") ## annotate the peaks with precompiled ensembl annotation data(TSS.human.GRCh38) macs.anno <- annotatePeakInBatch(macsOutput, AnnotationData=TSS.human.GRCh38) ## add gene symbols library(org.Hs.eg.db) macs.anno <- addGeneIDs(annotatedPeak=macs.anno, orgAnn="org.Hs.eg.db", IDs2Add="symbol") if(interactive()){## annotate the peaks with UCSC annotation library(GenomicFeatures) library(TxDb.Hsapiens.UCSC.hg38.knownGene) ucsc.hg38.knownGene <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene) macs.anno <- annotatePeakInBatch(macsOutput, AnnotationData=ucsc.hg38.knownGene) macs.anno <- addGeneIDs(annotatedPeak=macs.anno, orgAnn="org.Hs.eg.db", feature_id_type="entrez_id", IDs2Add="symbol") } ## ----workflow19, fig.cap="venn diagram of overlaps for duplicated experiments"---- bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno") gr1 <- toGRanges(bed, format="BED", header=FALSE) ## one can also try import from rtracklayer library(rtracklayer) gr1.import <- import(bed, format="BED") identical(start(gr1), start(gr1.import)) gr1[1:2] gr1.import[1:2] #note the name slot is different from gr1 gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno") gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3) ol <- findOverlapsOfPeaks(gr1, gr2) makeVennDiagram(ol, fill=c("#009E73", "#F0E442"), # circle fill color col=c("#D55E00", "#0072B2"), #circle border color cat.col=c("#D55E00", "#0072B2")) ## ----workflow20,fig.cap="Pie chart of common peaks among features"------------ pie1(table(ol$overlappingPeaks[["gr1///gr2"]]$overlapFeature)) ## ----workflow21--------------------------------------------------------------- overlaps <- ol$peaklist[["gr1///gr2"]] ## ============== old style =========== ## data(TSS.human.GRCh37) ## overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, ## output="overlapping", maxgap=5000L) ## overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", "symbol") ## ============== new style =========== library(EnsDb.Hsapiens.v75) ##(hg19) ## create annotation file from EnsDb or TxDb annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene") annoData[1:2] overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, output="overlapping", maxgap=5000L) overlaps.anno$gene_name <- annoData$gene_name[match(overlaps.anno$feature, names(annoData))] head(overlaps.anno) ## ----workflow22,fig.cap="Distribution of aggregated peak scores or peak numbers around transcript start sites.",fig.width=8,fig.height=6---- gr1.copy <- gr1 gr1.copy$score <- 1 binOverFeature(gr1, gr1.copy, annotationData=annoData, radius=5000, nbins=10, FUN=c(sum, length), ylab=c("score", "count"), main=c("Distribution of aggregated peak scores around TSS", "Distribution of aggregated peak numbers around TSS")) ## ----workflow23,fig.cap="Peak distribution over different genomic features.",fig.width=10,fig.height=4---- if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){ aCR<-assignChromosomeRegion(gr1, nucleotideLevel=FALSE, precedence=c("Promoters", "immediateDownstream", "fiveUTRs", "threeUTRs", "Exons", "Introns"), TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene) barplot(aCR$percentage) } ## ----findOverlapsOfPeaks3----------------------------------------------------- peaks1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "2", "6", "6", "6", "6", "5"), ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869, 3123260, 3857501, 201089, 1543200, 1557200, 1563000, 1569800, 167889600), end= c(967754, 2010997, 2496804, 3075969, 3123360, 3857601, 201089, 1555199, 1560599, 1565199, 1573799, 167893599), names=paste("Site", 1:12, sep="")), strand="+") peaks2 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", "3", "4", "5", "6", "6", "6", "6", "6", "5"), ranges=IRanges(start=c(967659, 2010898, 2496700, 3075866, 3123260, 3857500, 96765, 201089, 249670, 307586, 312326, 385750, 1549800, 1554400, 1565000, 1569400, 167888600), end=c(967869, 2011108, 2496920, 3076166,3123470, 3857780, 96985, 201299, 249890, 307796, 312586, 385960, 1550599, 1560799, 1565399, 1571199, 167888999), names=paste("t", 1:17, sep="")), strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-", "-", "-", "-", "+", "+", "+", "+", "+")) ol <- findOverlapsOfPeaks(peaks1, peaks2, maxgap=1000) peaklist <- ol$peaklist ## ----overlappingPeaks4,fig.cap="Pie chart of common peaks among features."---- overlappingPeaks <- ol$overlappingPeaks names(overlappingPeaks) dim(overlappingPeaks[["peaks1///peaks2"]]) overlappingPeaks[["peaks1///peaks2"]][1:2, ] pie1(table(overlappingPeaks[["peaks1///peaks2"]]$overlapFeature)) ## ----overlappingPeaks5-------------------------------------------------------- peaklist[["peaks1///peaks2"]] ## ----6------------------------------------------------------------------------ peaklist[["peaks1"]] ## ----7------------------------------------------------------------------------ peaklist[["peaks2"]] ## ----findOverlapsOfPeaks8,fig.cap="venn diagram of overlaps",fig.width=6,fig.height=6---- makeVennDiagram(ol, totalTest=1e+2, fill=c("#009E73", "#F0E442"), # circle fill color col=c("#D55E00", "#0072B2"), #circle border color cat.col=c("#D55E00", "#0072B2")) ## ----VennerableFigure--------------------------------------------------------- # install.packages("Vennerable", repos="http://R-Forge.R-project.org", # type="source") # library(Vennerable) # venn_cnt2venn <- function(venn_cnt){ # n <- which(colnames(venn_cnt)=="Counts") - 1 # SetNames=colnames(venn_cnt)[1:n] # Weight=venn_cnt[,"Counts"] # names(Weight) <- apply(venn_cnt[,1:n], 1, base::paste, collapse="") # Venn(SetNames=SetNames, Weight=Weight) # } # # v <- venn_cnt2venn(ol$venn_cnt) # plot(v) ## ----findOverlapsOfPeaks9,fig.cap="venn diagram of overlaps for three input peak lists",fig.width=6,fig.height=6---- peaks3 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", "3", "4"), ranges=IRanges(start=c(967859, 2010868, 2496500, 3075966, 3123460, 3851500, 96865, 201189, 249600, 307386), end= c(967969, 2011908, 2496720, 3076166, 3123470, 3857680, 96985, 201299, 249890, 307796), names=paste("p", 1:10, sep="")), strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-", "-")) ol <- findOverlapsOfPeaks(peaks1, peaks2, peaks3, maxgap=1000, connectedPeaks="min") makeVennDiagram(ol, totalTest=1e+2, fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color cat.col=c("#D55E00", "#0072B2", "#E69F00")) ## ----annoGRgene--------------------------------------------------------------- library(TxDb.Hsapiens.UCSC.hg19.knownGene) annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene") annoData ## ----annotatePeakInBatchByannoGR---------------------------------------------- data(myPeakList) annotatedPeak <- annotatePeakInBatch(myPeakList[1:6], AnnotationData = annoData) annotatedPeak[1:3] ## ----annotatePeakInBatch1----------------------------------------------------- data(TSS.human.NCBI36) annotatedPeak <- annotatePeakInBatch(myPeakList[1:6], AnnotationData=TSS.human.NCBI36) annotatedPeak[1:3] ## ----annotatePeakInBatch2----------------------------------------------------- myPeak1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "2", "6", "6", "6", "6", "5"), ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869, 3123260, 3857501, 201089, 1543200, 1557200, 1563000, 1569800, 167889600), end= c(967754, 2010997, 2496804, 3075969, 3123360, 3857601, 201089, 1555199, 1560599, 1565199, 1573799, 167893599), names=paste("Site", 1:12, sep=""))) TFbindingSites <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", "3", "4", "5", "6", "6", "6", "6", "6", "5"), ranges=IRanges(start=c(967659, 2010898, 2496700, 3075866, 3123260, 3857500, 96765, 201089, 249670, 307586, 312326, 385750, 1549800, 1554400, 1565000, 1569400, 167888600), end=c(967869, 2011108, 2496920, 3076166,3123470, 3857780, 96985, 201299, 249890, 307796, 312586, 385960, 1550599, 1560799, 1565399, 1571199, 167888999), names=paste("t", 1:17, sep="")), strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-", "-", "-", "-", "+", "+", "+", "+", "+")) annotatedPeak2 <- annotatePeakInBatch(myPeak1, AnnotationData=TFbindingSites) annotatedPeak2[1:3] ## ----pie1,fig.cap="Pie chart of peak distribution among features",fig.width=5,fig.height=5---- pie1(table(as.data.frame(annotatedPeak2)$insideFeature)) ## ----annotatedPromoter-------------------------------------------------------- library(ChIPpeakAnno) data(myPeakList) data(TSS.human.NCBI36) annotationData <- promoters(TSS.human.NCBI36, upstream=5000, downstream=500) annotatedPeak <- annotatePeakInBatch(myPeakList[1:6,], AnnotationData=annotationData, output="overlapping") annotatedPeak[1:3] ## ----------------------------------------------------------------------------- annotatedPeak <- annotatePeakInBatch(myPeakList[1:6], AnnotationData = annoData, output="both", maxgap=5000) head(annotatedPeak) ## ----addGeneIDs18------------------------------------------------------------- data(annotatedPeak) library(org.Hs.eg.db) addGeneIDs(annotatedPeak[1:6], orgAnn="org.Hs.eg.db", IDs2Add=c("symbol")) addGeneIDs(annotatedPeak$feature[1:6], orgAnn="org.Hs.eg.db", IDs2Add=c("symbol")) ## ----getAllPeakSequence12----------------------------------------------------- peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"), ranges=IRanges(start=c(100, 500), end=c(300, 600), names=c("peak1", "peak2"))) library(BSgenome.Ecoli.NCBI.20080805) peaksWithSequences <- getAllPeakSequence(peaks, upstream=20, downstream=20, genome=Ecoli) ## ----write2FASTA13------------------------------------------------------------ write2FASTA(peaksWithSequences,"test.fa") ## ----heatmap,fig.cap="Heatmap of aligned features",fig.width=4,fig.height=6---- path <- system.file("extdata", package="ChIPpeakAnno") files <- dir(path, "broadPeak") data <- sapply(file.path(path, files), toGRanges, format="broadPeak") names(data) <- gsub(".broadPeak", "", files) ol <- findOverlapsOfPeaks(data) #makeVennDiagram(ol) features <- ol$peaklist[[length(ol$peaklist)]] wid <- width(features) feature.recentered <- feature.center <- features start(feature.center) <- start(features) + floor(wid/2) width(feature.center) <- 1 start(feature.recentered) <- start(feature.center) - 2000 end(feature.recentered) <- end(feature.center) + 2000 ## here we also suggest importData function in bioconductor trackViewer package ## to import the coverage. ## compare rtracklayer, it will save you time when handle huge dataset. library(rtracklayer) files <- dir(path, "bigWig") if(.Platform$OS.type != "windows"){ cvglists <- sapply(file.path(path, files), import, format="BigWig", which=feature.recentered, as="RleList") }else{## rtracklayer can not import bigWig files on Windows load(file.path(path, "cvglist.rds")) } names(cvglists) <- gsub(".bigWig", "", files) sig <- featureAlignedSignal(cvglists, feature.center, upstream=2000, downstream=2000) heatmap <- featureAlignedHeatmap(sig, feature.center, upstream=2000, downstream=2000, upper.extreme=c(3,.5,4)) ## ----distribution,fig.cap="Distribution of aligned features",fig.width=6,fig.height=6---- featureAlignedDistribution(sig, feature.center, upstream=2000, downstream=2000, type="l") ## ----summarizePatternInPeaks17------------------------------------------------ peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"), ranges=IRanges(start=c(100, 500), end=c(300, 600), names=c("peak1", "peak2"))) filepath <- system.file("extdata", "examplePattern.fa", package="ChIPpeakAnno") readLines(filepath) library(BSgenome.Ecoli.NCBI.20080805) summarizePatternInPeaks(patternFilePath=filepath, format="fasta", skip=0L, BSgenomeName=Ecoli, peaks=peaks) ## ----getEnriched14------------------------------------------------------------ library(org.Hs.eg.db) over <- getEnrichedGO(annotatedPeak[1:500], orgAnn="org.Hs.eg.db", maxP=0.01, minGOterm=10, multiAdjMethod="BH", condense=FALSE) head(over[["bp"]][, -3]) head(over[["cc"]][, -3]) head(over[["mf"]][, -3]) ## ----egOrgMap15--------------------------------------------------------------- egOrgMap("Mus musculus") egOrgMap("Homo sapiens") ## ----peaksNearBDP16----------------------------------------------------------- data(myPeakList) data(TSS.human.NCBI36) seqlevelsStyle(TSS.human.NCBI36) <- seqlevelsStyle(myPeakList) annotatedBDP <- peaksNearBDP(myPeakList[1:10,], AnnotationData=TSS.human.NCBI36, MaxDistance=5000) annotatedBDP$peaksWithBDP c(annotatedBDP$percentPeaksWithBDP, annotatedBDP$n.peaks, annotatedBDP$n.peaksWithBDP) ## ----peakPermTest------------------------------------------------------------- if(interactive()){ library(TxDb.Hsapiens.UCSC.hg19.knownGene) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene cds <- unique(unlist(cdsBy(txdb))) utr5 <- unique(unlist(fiveUTRsByTranscript(txdb))) utr3 <- unique(unlist(threeUTRsByTranscript(txdb))) set.seed(123) utr3 <- utr3[sample.int(length(utr3), 1000)] pt <- peakPermTest(utr3, utr5[sample.int(length(utr5), 1000)], maxgap=500, TxDb=txdb, seed=1, force.parallel=FALSE) plot(pt) ## highly relevant peaks ol <- findOverlaps(cds, utr3, maxgap=1) pt1 <- peakPermTest(utr3, c(cds[sample.int(length(cds), 500)], cds[queryHits(ol)][sample.int(length(ol), 500)]), maxgap=500, TxDb=txdb, seed=1, force.parallel=FALSE) plot(pt1) } ## ----peakPermTest1------------------------------------------------------------ if(interactive()){ data(HOT.spots) data(wgEncodeTfbsV3) hotGR <- reduce(unlist(HOT.spots)) removeOl <- function(.ele){ ol <- findOverlaps(.ele, hotGR) if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))] .ele } temp <- tempfile() download.file(file.path("http://hgdownload.cse.ucsc.edu", "goldenPath", "hg19", "encodeDCC", "wgEncodeRegTfbsClustered", "wgEncodeRegTfbsClusteredV3.bed.gz"), temp) data <- toGRanges(gzfile(temp, "r"), header=FALSE, format="others", colNames = c("seqnames", "start", "end", "TF")) unlink(temp) data <- split(data, data$TF) TAF1 <- removeOl(data[["TAF1"]]) TEAD4 <- removeOl(data[["TEAD4"]]) pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3), N=length(TAF1)) pt <- peakPermTest(TAF1, TEAD4, pool=pool, ntimes=1000) plot(pt) } ## ----citation, echo=FALSE----------------------------------------------------- citation(package="ChIPpeakAnno") ## ----sessionInfo-------------------------------------------------------------- sessionInfo()