## ---- echo=FALSE, results="hide", warning=FALSE-------------------------------
suppressPackageStartupMessages({
    library(ChIPpeakAnno)
    library(org.Hs.eg.db)
    library(GenomicFeatures)
    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    library(rtracklayer)
    library(GO.db)
    library(EnsDb.Hsapiens.v75)
    library(BSgenome.Ecoli.NCBI.20080805)
})
knitr::opts_chunk$set(warning=FALSE, message=FALSE)

## ----quickStart---------------------------------------------------------------
library(ChIPpeakAnno)
## import the MACS output
macs <- system.file("extdata", "MACS_peaks.xls", package="ChIPpeakAnno")
macsOutput <- toGRanges(macs, format="MACS")
## annotate the peaks with precompiled ensembl annotation
data(TSS.human.GRCh38)
macs.anno <- annotatePeakInBatch(macsOutput, AnnotationData=TSS.human.GRCh38)
## add gene symbols
library(org.Hs.eg.db)
macs.anno <- addGeneIDs(annotatedPeak=macs.anno, 
                        orgAnn="org.Hs.eg.db", 
                        IDs2Add="symbol")

if(interactive()){## annotate the peaks with UCSC annotation
    library(GenomicFeatures)
    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    ucsc.hg38.knownGene <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene)
    macs.anno <- annotatePeakInBatch(macsOutput, 
                                     AnnotationData=ucsc.hg38.knownGene)
    macs.anno <- addGeneIDs(annotatedPeak=macs.anno, 
                            orgAnn="org.Hs.eg.db", 
                            feature_id_type="entrez_id",
                            IDs2Add="symbol")
}

## ----workflow19, fig.cap="venn diagram of overlaps for duplicated experiments"----
bed <- system.file("extdata", "MACS_output.bed", package="ChIPpeakAnno")
gr1 <- toGRanges(bed, format="BED", header=FALSE) 
## one can also try import from rtracklayer
library(rtracklayer)
gr1.import <- import(bed, format="BED")
identical(start(gr1), start(gr1.import))
gr1[1:2]
gr1.import[1:2] #note the name slot is different from gr1
gff <- system.file("extdata", "GFF_peaks.gff", package="ChIPpeakAnno")
gr2 <- toGRanges(gff, format="GFF", header=FALSE, skip=3)
ol <- findOverlapsOfPeaks(gr1, gr2)
makeVennDiagram(ol,
                fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2"))

## ----workflow20,fig.cap="Pie chart of common peaks among features"------------
pie1(table(ol$overlappingPeaks[["gr1///gr2"]]$overlapFeature))

## ----workflow21---------------------------------------------------------------
overlaps <- ol$peaklist[["gr1///gr2"]]
## ============== old style ===========
## data(TSS.human.GRCh37) 
## overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, 
##                                      output="overlapping", maxgap=5000L)
## overlaps.anno <- addGeneIDs(overlaps.anno, "org.Hs.eg.db", "symbol")
## ============== new style ===========
library(EnsDb.Hsapiens.v75) ##(hg19)
## create annotation file from EnsDb or TxDb
annoData <- toGRanges(EnsDb.Hsapiens.v75, feature="gene")
annoData[1:2]
overlaps.anno <- annotatePeakInBatch(overlaps, AnnotationData=annoData, 
                                    output="overlapping", maxgap=5000L)
overlaps.anno$gene_name <- 
    annoData$gene_name[match(overlaps.anno$feature,
                             names(annoData))]
head(overlaps.anno)

## ----workflow22,fig.cap="Distribution of aggregated peak scores or peak numbers around transcript start sites.",fig.width=8,fig.height=6----
gr1.copy <- gr1
gr1.copy$score <- 1
binOverFeature(gr1, gr1.copy, annotationData=annoData,
               radius=5000, nbins=10, FUN=c(sum, length),
               ylab=c("score", "count"), 
               main=c("Distribution of aggregated peak scores around TSS", 
                      "Distribution of aggregated peak numbers around TSS"))

## ----workflow23,fig.cap="Peak distribution over different genomic features.",fig.width=10,fig.height=4----
if(require(TxDb.Hsapiens.UCSC.hg19.knownGene)){
    aCR<-assignChromosomeRegion(gr1, nucleotideLevel=FALSE, 
                           precedence=c("Promoters", "immediateDownstream", 
                                         "fiveUTRs", "threeUTRs", 
                                         "Exons", "Introns"), 
                           TxDb=TxDb.Hsapiens.UCSC.hg19.knownGene)
    barplot(aCR$percentage)
}

## ----findOverlapsOfPeaks3-----------------------------------------------------
peaks1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", 
                              "2", "6", "6", "6", "6", "5"),
                   ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869, 
                                          3123260, 3857501, 201089, 1543200, 
                                          1557200, 1563000, 1569800, 167889600),
                                  end= c(967754, 2010997, 2496804, 3075969, 
                                         3123360, 3857601, 201089, 1555199,
                                         1560599, 1565199, 1573799, 167893599),
                                  names=paste("Site", 1:12, sep="")),
                  strand="+")

peaks2 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", "3", 
                                     "4", "5", "6", "6", "6", "6", "6", "5"),
                          ranges=IRanges(start=c(967659, 2010898, 2496700, 
                                                 3075866, 3123260, 3857500, 
                                                 96765, 201089, 249670, 307586, 
                                                 312326, 385750, 1549800, 
                                                 1554400, 1565000, 1569400,
                                                 167888600), 
                                         end=c(967869, 2011108, 2496920, 
                                               3076166,3123470, 3857780, 
                                               96985, 201299, 249890, 307796, 
                                               312586, 385960, 1550599, 1560799,
                                               1565399, 1571199, 167888999), 
                                         names=paste("t", 1:17, sep="")),
                          strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-", 
                                   "-", "-", "-", "+", "+", "+", "+", "+"))

ol <- findOverlapsOfPeaks(peaks1, peaks2, maxgap=1000)
peaklist <- ol$peaklist

## ----overlappingPeaks4,fig.cap="Pie chart of common peaks among features."----
overlappingPeaks <- ol$overlappingPeaks
names(overlappingPeaks)
dim(overlappingPeaks[["peaks1///peaks2"]])
overlappingPeaks[["peaks1///peaks2"]][1:2, ]
pie1(table(overlappingPeaks[["peaks1///peaks2"]]$overlapFeature))

## ----overlappingPeaks5--------------------------------------------------------
peaklist[["peaks1///peaks2"]]

## ----6------------------------------------------------------------------------
peaklist[["peaks1"]]

## ----7------------------------------------------------------------------------
peaklist[["peaks2"]]

## ----findOverlapsOfPeaks8,fig.cap="venn diagram of overlaps",fig.width=6,fig.height=6----
makeVennDiagram(ol, totalTest=1e+2,
                fill=c("#009E73", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2"), #circle border color
                cat.col=c("#D55E00", "#0072B2"))

## ----VennerableFigure---------------------------------------------------------
#     install.packages("Vennerable", repos="http://R-Forge.R-project.org", 
#                     type="source")
#     library(Vennerable)
#     venn_cnt2venn <- function(venn_cnt){
#         n <- which(colnames(venn_cnt)=="Counts") - 1
#         SetNames=colnames(venn_cnt)[1:n]
#         Weight=venn_cnt[,"Counts"]
#         names(Weight) <- apply(venn_cnt[,1:n], 1, base::paste, collapse="")
#         Venn(SetNames=SetNames, Weight=Weight)
#     }
# 
#     v <- venn_cnt2venn(ol$venn_cnt)
#     plot(v)

## ----findOverlapsOfPeaks9,fig.cap="venn diagram of overlaps for three input peak lists",fig.width=6,fig.height=6----
peaks3 <- GRanges(seqnames=c("1", "2", "3", "4", "5", 
                             "6", "1", "2", "3", "4"),
                   ranges=IRanges(start=c(967859, 2010868, 2496500, 3075966,
                                          3123460, 3851500, 96865, 201189, 
                                          249600, 307386),
                                  end= c(967969, 2011908, 2496720, 3076166,
                                         3123470, 3857680, 96985, 201299, 
                                         249890, 307796),
                                  names=paste("p", 1:10, sep="")),
                  strand=c("+", "+", "+", "+", "+", 
                           "+", "-", "-", "-", "-"))

ol <- findOverlapsOfPeaks(peaks1, peaks2, peaks3, maxgap=1000, 
                          connectedPeaks="min")
makeVennDiagram(ol, totalTest=1e+2,
                fill=c("#CC79A7", "#56B4E9", "#F0E442"), # circle fill color
                col=c("#D55E00", "#0072B2", "#E69F00"), #circle border color
                cat.col=c("#D55E00", "#0072B2", "#E69F00"))

## ----annoGRgene---------------------------------------------------------------
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene, feature="gene")
annoData

## ----annotatePeakInBatchByannoGR----------------------------------------------
data(myPeakList)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
                                     AnnotationData = annoData)
annotatedPeak[1:3]

## ----annotatePeakInBatch1-----------------------------------------------------
data(TSS.human.NCBI36)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6], 
                 AnnotationData=TSS.human.NCBI36)
annotatedPeak[1:3]

## ----annotatePeakInBatch2-----------------------------------------------------
myPeak1 <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", 
                              "2", "6", "6", "6", "6", "5"),
                   ranges=IRanges(start=c(967654, 2010897, 2496704, 3075869, 
                                          3123260, 3857501, 201089, 1543200, 
                                          1557200, 1563000, 1569800, 167889600),
                                  end= c(967754, 2010997, 2496804, 3075969, 
                                         3123360, 3857601, 201089, 1555199,
                                         1560599, 1565199, 1573799, 167893599),
                                  names=paste("Site", 1:12, sep="")))

TFbindingSites <- GRanges(seqnames=c("1", "2", "3", "4", "5", "6", "1", "2", 
                                     "3", "4", "5", "6", "6", "6", "6", "6",
                                     "5"),
                          ranges=IRanges(start=c(967659, 2010898, 2496700, 
                                                 3075866, 3123260, 3857500, 
                                                 96765, 201089, 249670, 307586, 
                                                 312326, 385750, 1549800, 
                                                 1554400, 1565000, 1569400,
                                                 167888600), 
                                         end=c(967869, 2011108, 2496920, 
                                               3076166,3123470, 3857780, 
                                               96985, 201299, 249890, 307796, 
                                               312586, 385960, 1550599, 1560799,
                                               1565399, 1571199, 167888999), 
                                         names=paste("t", 1:17, sep="")),
                          strand=c("+", "+", "+", "+", "+", "+", "-", "-", "-", 
                                   "-", "-", "-", "+", "+", "+", "+", "+"))

annotatedPeak2 <- annotatePeakInBatch(myPeak1, AnnotationData=TFbindingSites)
annotatedPeak2[1:3]

## ----pie1,fig.cap="Pie chart of peak distribution among features",fig.width=5,fig.height=5----
pie1(table(as.data.frame(annotatedPeak2)$insideFeature))

## ----annotatedPromoter--------------------------------------------------------
library(ChIPpeakAnno)
data(myPeakList)
data(TSS.human.NCBI36)
annotationData <- promoters(TSS.human.NCBI36, upstream=5000, downstream=500)
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6,], 
                                     AnnotationData=annotationData,
                                     output="overlapping")
annotatedPeak[1:3]

## -----------------------------------------------------------------------------
annotatedPeak <- annotatePeakInBatch(myPeakList[1:6],
                                     AnnotationData = annoData,
                                     output="both", maxgap=5000)
head(annotatedPeak)

## ----addGeneIDs18-------------------------------------------------------------
data(annotatedPeak)
library(org.Hs.eg.db)
addGeneIDs(annotatedPeak[1:6], orgAnn="org.Hs.eg.db", IDs2Add=c("symbol"))
addGeneIDs(annotatedPeak$feature[1:6], orgAnn="org.Hs.eg.db", 
           IDs2Add=c("symbol"))

## ----getAllPeakSequence12-----------------------------------------------------
peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
                 ranges=IRanges(start=c(100, 500), 
                                end=c(300, 600), 
                                names=c("peak1", "peak2")))
library(BSgenome.Ecoli.NCBI.20080805)
peaksWithSequences <- getAllPeakSequence(peaks, upstream=20, 
                                         downstream=20, genome=Ecoli)

## ----write2FASTA13------------------------------------------------------------
write2FASTA(peaksWithSequences,"test.fa")

## ----heatmap,fig.cap="Heatmap of aligned features",fig.width=4,fig.height=6----
path <- system.file("extdata", package="ChIPpeakAnno")
files <- dir(path, "broadPeak")
data <- sapply(file.path(path, files), toGRanges, format="broadPeak")
names(data) <- gsub(".broadPeak", "", files)
ol <- findOverlapsOfPeaks(data)
#makeVennDiagram(ol)
features <- ol$peaklist[[length(ol$peaklist)]]
wid <- width(features)
feature.recentered <- feature.center <- features
start(feature.center) <- start(features) + floor(wid/2)
width(feature.center) <- 1
start(feature.recentered) <- start(feature.center) - 2000
end(feature.recentered) <- end(feature.center) + 2000
## here we also suggest importData function in bioconductor trackViewer package 
## to import the coverage.
## compare rtracklayer, it will save you time when handle huge dataset.
library(rtracklayer)
files <- dir(path, "bigWig")
if(.Platform$OS.type != "windows"){
    cvglists <- sapply(file.path(path, files), import, 
                       format="BigWig", 
                       which=feature.recentered, 
                       as="RleList")
}else{## rtracklayer can not import bigWig files on Windows
    load(file.path(path, "cvglist.rds"))
}
names(cvglists) <- gsub(".bigWig", "", files)
sig <- featureAlignedSignal(cvglists, feature.center, 
                            upstream=2000, downstream=2000) 
heatmap <- featureAlignedHeatmap(sig, feature.center, 
                                 upstream=2000, downstream=2000,
                                 upper.extreme=c(3,.5,4))

## ----distribution,fig.cap="Distribution of aligned features",fig.width=6,fig.height=6----
featureAlignedDistribution(sig, feature.center, 
                           upstream=2000, downstream=2000,
                           type="l")

## ----summarizePatternInPeaks17------------------------------------------------
peaks <- GRanges(seqnames=c("NC_008253", "NC_010468"),
                 ranges=IRanges(start=c(100, 500), 
                                end=c(300, 600), 
                                names=c("peak1", "peak2")))
filepath <- system.file("extdata", "examplePattern.fa", package="ChIPpeakAnno")
readLines(filepath)
library(BSgenome.Ecoli.NCBI.20080805)
summarizePatternInPeaks(patternFilePath=filepath, format="fasta", skip=0L, 
                        BSgenomeName=Ecoli, peaks=peaks)

## ----getEnriched14------------------------------------------------------------
library(org.Hs.eg.db)
over <- getEnrichedGO(annotatedPeak[1:500], orgAnn="org.Hs.eg.db", 
                    maxP=0.01, minGOterm=10, 
                    multiAdjMethod="BH",
                    condense=FALSE)
head(over[["bp"]][, -3])
head(over[["cc"]][, -3])
head(over[["mf"]][, -3])

## ----egOrgMap15---------------------------------------------------------------
egOrgMap("Mus musculus")
egOrgMap("Homo sapiens")

## ----peaksNearBDP16-----------------------------------------------------------
data(myPeakList)
data(TSS.human.NCBI36)
seqlevelsStyle(TSS.human.NCBI36) <- seqlevelsStyle(myPeakList)
annotatedBDP <- peaksNearBDP(myPeakList[1:10,], 
                             AnnotationData=TSS.human.NCBI36, 
                             MaxDistance=5000)
annotatedBDP$peaksWithBDP
c(annotatedBDP$percentPeaksWithBDP, 
  annotatedBDP$n.peaks, 
  annotatedBDP$n.peaksWithBDP)

## ----peakPermTest-------------------------------------------------------------
if(interactive()){
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
cds <- unique(unlist(cdsBy(txdb)))
utr5 <- unique(unlist(fiveUTRsByTranscript(txdb)))
utr3 <- unique(unlist(threeUTRsByTranscript(txdb)))
set.seed(123)
utr3 <- utr3[sample.int(length(utr3), 1000)]
pt <- peakPermTest(utr3, 
             utr5[sample.int(length(utr5), 1000)], 
             maxgap=500,
             TxDb=txdb, seed=1,
             force.parallel=FALSE)
plot(pt)
## highly relevant peaks
ol <- findOverlaps(cds, utr3, maxgap=1)
pt1 <- peakPermTest(utr3,
             c(cds[sample.int(length(cds), 500)], 
                cds[queryHits(ol)][sample.int(length(ol), 500)]), 
             maxgap=500, 
             TxDb=txdb, seed=1,
             force.parallel=FALSE)
plot(pt1)
}

## ----peakPermTest1------------------------------------------------------------
if(interactive()){
    data(HOT.spots)
    data(wgEncodeTfbsV3)
    hotGR <- reduce(unlist(HOT.spots))
    removeOl <- function(.ele){
        ol <- findOverlaps(.ele, hotGR)
        if(length(ol)>0) .ele <- .ele[-unique(queryHits(ol))]
        .ele
    }
    temp <- tempfile()
    download.file(file.path("http://hgdownload.cse.ucsc.edu", 
                            "goldenPath", "hg19", "encodeDCC", 
                            "wgEncodeRegTfbsClustered", 
                            "wgEncodeRegTfbsClusteredV3.bed.gz"), temp)
    data <- toGRanges(gzfile(temp, "r"), header=FALSE, format="others", 
                      colNames = c("seqnames", "start", "end", "TF"))
    unlink(temp)
    data <- split(data, data$TF)
    TAF1 <- removeOl(data[["TAF1"]])
    TEAD4 <- removeOl(data[["TEAD4"]])
    pool <- new("permPool", grs=GRangesList(wgEncodeTfbsV3), N=length(TAF1))
    pt <- peakPermTest(TAF1, TEAD4, pool=pool, ntimes=1000)
    plot(pt)
}

## ----citation, echo=FALSE-----------------------------------------------------
citation(package="ChIPpeakAnno")

## ----sessionInfo--------------------------------------------------------------
sessionInfo()