## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html
)

## ----"install", eval = FALSE--------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE)) {
#        install.packages("BiocManager")
#    }
#  
#  BiocManager::install("FindIT2")
#  
#  # Check that you have a valid Bioconductor installation
#  BiocManager::valid()

## -----------------------------------------------------------------------------
citation("FindIT2")

## ----"start", message=FALSE---------------------------------------------------
# load packages
# If you want to run this manual, please check you have install below packages.
library(FindIT2)
library(TxDb.Athaliana.BioMart.plantsmart28)
library(SummarizedExperiment)

library(dplyr)
library(ggplot2)

# because of the fa I use, I change the seqlevels of Txdb to make the chrosome levels consistent
Txdb <- TxDb.Athaliana.BioMart.plantsmart28
seqlevels(Txdb) <- c(paste0("Chr", 1:5), "M", "C")

all_geneSet <- genes(Txdb)


## ----eval=FALSE---------------------------------------------------------------
#  # when you make sure your first metadata column is peak name
#  colnames(mcols(yourGR))[1] <- "feature_id"
#  
#  # or you just add a column
#  yourGR$feature_id <- paste0("peak_", seq_len(length(yourGR)))
#  

## ----"prepare data"-----------------------------------------------------------
# load the test ChIP peak bed
ChIP_peak_path <- system.file("extdata", "ChIP.bed.gz", package = "FindIT2")
ChIP_peak_GR <- loadPeakFile(ChIP_peak_path)

# you can see feature_id is in your first column of metadata
ChIP_peak_GR

## ----"mm_nearestgene"---------------------------------------------------------
mmAnno_nearestgene <- mm_nearestGene(peak_GR = ChIP_peak_GR,
                                     Txdb = Txdb)

mmAnno_nearestgene


## -----------------------------------------------------------------------------
plot_annoDistance(mmAnno = mmAnno_nearestgene)

## -----------------------------------------------------------------------------
getAssocPairNumber(mmAnno = mmAnno_nearestgene)

getAssocPairNumber(mmAnno = mmAnno_nearestgene,
                   output_summary = TRUE)

# you can see all peak's related gene number is 1 because I use the nearest gene mode
getAssocPairNumber(mmAnno_nearestgene, output_type = "feature_id")

getAssocPairNumber(mmAnno = mmAnno_nearestgene,
                   output_type = "feature_id",
                   output_summary = TRUE)

## -----------------------------------------------------------------------------
plot_peakGeneAlias_summary(mmAnno_nearestgene)
plot_peakGeneAlias_summary(mmAnno_nearestgene, output_type = "feature_id")

## ----"mm_geneBound"-----------------------------------------------------------
# The genes_Chr5 is all gene set in Chr5
genes_Chr5 <- names(all_geneSet[seqnames(all_geneSet) == "Chr5"])

# The genes_Chr5_notAnno is gene set which is not linked by peak
genes_Chr5_notAnno <- genes_Chr5[!genes_Chr5 %in% unique(mmAnno_nearestgene$gene_id)]

# The genes_Chr5_tAnno is gene set which is linked by peak
genes_Chr5_Anno <- unique(mmAnno_nearestgene$gene_id)

# mm_geneBound will tell you there 5 genes in your input_genes not be annotated
# and it will use the distanceToNearest to find nearest peak of these genes
mmAnno_geneBound <- mm_geneBound(peak_GR = ChIP_peak_GR,
                                 Txdb = Txdb,
                                 input_genes = c(genes_Chr5_Anno[1:5], genes_Chr5_notAnno[1:5]))

# all of your input_genes have related peaks
mmAnno_geneBound


## ----"mm_geneScan"------------------------------------------------------------
mmAnno_geneScan <- mm_geneScan(peak_GR = ChIP_peak_GR,
                               Txdb = Txdb,
                               upstream = 2e4,
                               downstream = 2e4)

mmAnno_geneScan


## -----------------------------------------------------------------------------
getAssocPairNumber(mmAnno_geneScan)

getAssocPairNumber(mmAnno_geneScan, output_type = "feature_id")

## -----------------------------------------------------------------------------
plot_peakGeneAlias_summary(mmAnno_geneScan)
plot_peakGeneAlias_summary(mmAnno_geneScan, output_type = "feature_id")

## -----------------------------------------------------------------------------
# Here you can see the score column in metadata
ChIP_peak_GR

# I can rename it into feature_score
colnames(mcols(ChIP_peak_GR))[2] <- "feature_score"

ChIP_peak_GR


## ----"calcRP_TFHit"-----------------------------------------------------------
# if you just want to get RP_df, you can set report_fullInfo FALSE
fullRP_hit <- calcRP_TFHit(mmAnno = mmAnno_geneScan,
                           Txdb = Txdb,
                           decay_dist = 1000,
                           report_fullInfo = TRUE)

# if you set report_fullInfo to TRUE, the result will be a GRange object
# it maintain the mmAnno_geneScan result and add other column, which represent  
# the contribution of each peak to the final RP of the gene

fullRP_hit

# or you can directly extract from metadata of your result
peakRP_gene <- metadata(fullRP_hit)$peakRP_gene

# The result is ordered by the sumRP and you can decide the target threshold by yourself
peakRP_gene


## ----"calcRP_bw"--------------------------------------------------------------

bwFile <- system.file("extdata", "E50h_sampleChr5.bw", package = "FindIT2")
RP_df <- calcRP_coverage(bwFile = bwFile,
                         Txdb = Txdb,
                         Chrs_included = "Chr5")

head(RP_df)


## ----"calcRP_region"----------------------------------------------------------
data("ATAC_normCount")

# This ATAC peak is the merge peak set from E50h-72h
ATAC_peak_path <- system.file("extdata", "ATAC.bed.gz", package = "FindIT2")
ATAC_peak_GR <- loadPeakFile(ATAC_peak_path)

mmAnno_regionRP <- mm_geneScan(ATAC_peak_GR,
                               Txdb,
                               upstream = 2e4,
                               downstream = 2e4)

# This ATAC_normCount is the peak count matrix normilzed by DESeq2
calcRP_region(mmAnno = mmAnno_regionRP,
              peakScoreMt = ATAC_normCount,
              Txdb = Txdb,
              Chrs_included = "Chr5") -> regionRP

# The sumRP is a matrix representing your geneRP in every samples
sumRP <- assays(regionRP)$sumRP
head(sumRP)

# The fullRP is a detailed peak-RP-gene relationship
fullRP <- assays(regionRP)$fullRP
head(fullRP)


## ----"findITarget"------------------------------------------------------------
data("RNADiff_LEC2_GR")
integrate_ChIP_RNA(result_geneRP = peakRP_gene,
                   result_geneDiff = RNADiff_LEC2_GR) -> merge_result

# you can see compared with down gene, there are more up gene on the top rank in ChIP-Seq data
# In the meanwhile, the number of down gene is less than up
merge_result

# if you want to extract merge target data
target_result <- merge_result$data
target_result


## ----"prepare_for_findIT"-----------------------------------------------------

# Here I take the top50 gene from integrate_ChIP_RNA as my interesting gene set.
input_genes <- target_result$gene_id[1:50]

# I use mm_geneBound to find related peak, which I will take as my interesting peak set.
related_peaks <- mm_geneBound(peak_GR = ATAC_peak_GR,
                              Txdb = Txdb,
                              input_genes = input_genes)
input_feature_id <- unique(related_peaks$feature_id)

# AT1G28300 is LEC2 tair ID
# I add a column named TF_id into my ChIP Seq GR
ChIP_peak_GR$TF_id <- "AT1G28300"

# And I also add some other public ChIP-Seq data
TF_GR_database_path <- system.file("extdata", "TF_GR_database.bed.gz", package = "FindIT2")
TF_GR_database <- loadPeakFile(TF_GR_database_path)
TF_GR_database

# rename feature_id column into TF_id
# because the true thing I am interested in is TF set, not each TF binding site.
colnames(mcols(TF_GR_database))[1] <- "TF_id"

# merge LEC2 ChIP GR
TF_GR_database <- c(TF_GR_database, ChIP_peak_GR)

TF_GR_database


## ----"findIT_enrichWilcox"----------------------------------------------------
findIT_enrichWilcox(input_feature_id = input_feature_id, 
                    peak_GR = ATAC_peak_GR, 
                    TF_GR_database = TF_GR_database) -> result_enrichWilcox

# you can see AT1G28300 is top1
result_enrichWilcox


## ----"findIT_enrichFisher"----------------------------------------------------
findIT_enrichFisher(input_feature_id = input_feature_id, 
                   peak_GR = ATAC_peak_GR, 
                   TF_GR_database = TF_GR_database) -> result_enrichFisher

# you can see AT1G28300 is top1
result_enrichFisher


## -----------------------------------------------------------------------------
# Here I use the top 4 TF id to calculate jaccard similarity of TF
jaccard_findIT_enrichFisher(input_feature_id = input_feature_id,
                           peak_GR = ATAC_peak_GR,
                           TF_GR_database = TF_GR_database,
                           input_TF_id = result_enrichFisher$TF_id[1:4]) -> enrichAll_jaccard

# it report the jaccard similarity of TF you input

# but here I make the TF's own jaccard similarity 0, which is useful for heatmap
# If you want to convert it to 1, you can just use 
# diag(enrichAll_jaccard) <- 1

enrichAll_jaccard


## -----------------------------------------------------------------------------
data("TF_target_database")

# it should have two column named TF_id and target_gene.
head(TF_target_database)


## ----"findIT_TTPair"----------------------------------------------------------
# By default, TTpair will consider all target gene as background
# Because I just use part of true TF_target_database, the background calculation
# is not correct. 
# so I use all gene in Txdb as gene_background.

result_TTpair <- findIT_TTPair(input_genes = input_genes,
                               TF_target_database = TF_target_database,
                               gene_background = names(all_geneSet))

# you can see AT1G28300 is top1
result_TTpair


## -----------------------------------------------------------------------------

# Here I use the all TF_id because I just have three TF in result_TTpair
# For you, you can select top N TF_id as input_TF_id
jaccard_findIT_TTpair(input_genes = input_genes,
                      TF_target_database = TF_target_database,
                      input_TF_id = result_TTpair$TF_id) -> TTpair_jaccard

# Here I make the TF's own jaccard similarity 0, which is useful for heatmap
# If you want to convert it to 1, you can just use 
# diag(TTpair_jaccard) <- 1
TTpair_jaccard


## ----"findIT_TFHit"-----------------------------------------------------------
# For repeatability of results, you should set seed.
set.seed(20160806)

# the meaning of scan_dist and decay_dist is same as calcRP_TFHit
# the Chrs_included control the chromosome your background in 
# the background_number control the number of background gene

# If you want to compare the TF enrichment in your input_genes with other gene set
# you can input other gene set id into background_genes
result_TFHit <- findIT_TFHit(input_genes = input_genes,
                             Txdb = Txdb,
                             TF_GR_database = TF_GR_database,
                             scan_dist = 2e4,
                             decay_dist = 1e3,
                             Chrs_included = "Chr5",
                             background_number = 3000)

# you can see AT1G28300 is top1
result_TFHit


## -----------------------------------------------------------------------------
# For repeatability of results, you should set seed.
set.seed(20160806)
result_findIT_regionRP <- findIT_regionRP(regionRP = regionRP,
                                          Txdb = Txdb,
                                          TF_GR_database = TF_GR_database,
                                          input_genes = input_genes,
                                          background_number = 3000)

# The result object of findIT_regionRP is MultiAssayExperiment, same as calcRP_region
# TF_percentMean is the mean influence of TF on input genes minus background, 
# which represent the total influence of specific TF on your input genes
TF_percentMean <- assays(result_findIT_regionRP)$TF_percentMean
TF_pvalue <- assays(result_findIT_regionRP)$TF_pvalue


## -----------------------------------------------------------------------------
TF_percentMean

heatmap(TF_percentMean, Colv = NA, scale = "none")

## -----------------------------------------------------------------------------
metadata(result_findIT_regionRP)$percent_df %>% 
    filter(sample == "E5_0h_R1") %>% 
    select(gene_id, percent, TF_id) %>% 
    tidyr::pivot_wider(values_from = percent, names_from = gene_id) -> E50h_TF_percent

E50h_TF_mt <- as.matrix(E50h_TF_percent[, -1])
rownames(E50h_TF_mt) <- E50h_TF_percent$TF_id

E50h_TF_mt

heatmap(E50h_TF_mt, scale = "none")


## -----------------------------------------------------------------------------
metadata(result_findIT_regionRP)

metadata(result_findIT_regionRP)$percent_df %>% 
  filter(TF_id == "AT1G28300") %>% 
  select(-TF_id) %>% 
  tidyr::pivot_wider(names_from = sample, values_from = percent) -> LEC2_percent_df

LEC2_percent_mt <- as.matrix(LEC2_percent_df[, -1])
rownames(LEC2_percent_mt) <- LEC2_percent_df$gene_id

heatmap(LEC2_percent_mt, Colv = NA, scale = "none")
  

## ----eval = FALSE-------------------------------------------------------------
#  # Before using shiny function, you should merge the regionRP and result_findIT_regionRP firstly.
#  merge_result <- c(regionRP, result_findIT_regionRP)
#  InteractiveFindIT2::shinyParse_findIT_regionRP(merge_result = merge_result, mode = "gene")
#  
#  InteractiveFindIT2::shinyParse_findIT_regionRP(merge_result = merge_result,mode = "TF")

## -----------------------------------------------------------------------------

# For repeatability of results, you should set seed.
set.seed(20160806)
findIT_MARA(input_feature_id = input_feature_id,
            peak_GR = ATAC_peak_GR,
            peakScoreMt = ATAC_normCount,
            TF_GR_database = TF_GR_database,
            log = TRUE,
            meanScale = TRUE) -> result_findIT_MARA


# Please note that you should add the total motif scan data in TF_GR_database
# Here I just use the test public ChIP-Seq data, so the result is not valuable
result_findIT_MARA


## -----------------------------------------------------------------------------
# when you get the zscale value from findIT_MARA,
# you can use integrate_replicates to integrate replicate zscale by setting type as "rank_zscore"
# Here each replicate are combined using Stouffer’s method
MARA_mt <- as.matrix(result_findIT_MARA[, -1])

rownames(MARA_mt) <- result_findIT_MARA$TF_id

MARA_colData <- data.frame(row.names = colnames(MARA_mt),
                           type = gsub("_R[0-9]", "", colnames(MARA_mt))
                           )


integrate_replicates(mt = MARA_mt,
                     colData = MARA_colData,
                     type = "rank_zscore")


## -----------------------------------------------------------------------------
list(TF_Hit = result_TFHit,
     enrichFisher = result_enrichFisher,
     wilcox = result_enrichWilcox,
     TT_pair = result_TTpair
     ) -> rank_merge_list
purrr::map(names(rank_merge_list), .f = function(x){
    data <- rank_merge_list[[x]]
    data %>% 
        select(TF_id, rank) %>% 
        mutate(source = x) -> data
    return(data)
}) %>% 
    do.call(rbind, .) %>% 
    tidyr::pivot_wider(names_from = source, values_from = rank) -> rank_merge_df

rank_merge_df

# we only select TF which appears in all source
rank_merge_df <- rank_merge_df[rowSums(is.na(rank_merge_df)) == 0, ]

rank_merge_mt <- as.matrix(rank_merge_df[, -1])
rownames(rank_merge_mt) <- rank_merge_df$TF_id

colData <- data.frame(row.names = colnames(rank_merge_mt),
                      type = rep("source", ncol(rank_merge_mt)))

integrate_replicates(mt = rank_merge_mt, colData = colData, type = "rank")


## -----------------------------------------------------------------------------
data("RNA_normCount")

peak_GR <- loadPeakFile(ATAC_peak_path)[1:100]
mmAnno <- mm_geneScan(peak_GR,Txdb)

ATAC_colData <- data.frame(row.names = colnames(ATAC_normCount),
                           type = gsub("_R[0-9]", "", colnames(ATAC_normCount))
                           )

integrate_replicates(ATAC_normCount, ATAC_colData) -> ATAC_normCount_merge
RNA_colData <- data.frame(row.names = colnames(RNA_normCount),
                          type = gsub("_R[0-9]", "", colnames(RNA_normCount))
                          )
integrate_replicates(RNA_normCount, RNA_colData) -> RNA_normCount_merge

peakGeneCor(mmAnno = mmAnno,
            peakScoreMt = ATAC_normCount_merge,
            geneScoreMt = RNA_normCount_merge,
            parallel = FALSE) -> mmAnnoCor


## -----------------------------------------------------------------------------
subset(mmAnnoCor, cor > 0.8) %>% 
  getAssocPairNumber()

## -----------------------------------------------------------------------------
plot_peakGeneCor(mmAnnoCor = mmAnnoCor,
                 select_gene = "AT5G01075")

plot_peakGeneCor(mmAnnoCor = subset(mmAnnoCor, cor > 0.95),
                 select_gene = "AT5G01075")

plot_peakGeneCor(mmAnnoCor = subset(mmAnnoCor, cor > 0.95),
                 select_gene = "AT5G01075") +
  geom_point(aes(color = time_point))

plot_peakGeneAlias_summary(mmAnno = mmAnnoCor,
                           mmAnno_corFilter = subset(mmAnnoCor, cor > 0.8))


## ----eval=FALSE---------------------------------------------------------------
#  InteractiveFindIT2::shinyParse_peakGeneCor(mmAnnoCor)

## -----------------------------------------------------------------------------
enhancerPromoterCor(peak_GR = peak_GR[1:100],
                    Txdb = Txdb,
                    peakScoreMt = ATAC_normCount,
                    up_scanPromoter = 500,
                    down_scanPromoter = 500,
                    up_scanEnhancer = 2000,
                    down_scanEnhacner = 2000,
                    parallel = FALSE) -> mmAnnoCor_linkEP

## -----------------------------------------------------------------------------
plot_peakGeneCor(mmAnnoCor = mmAnnoCor_linkEP,
                 select_gene = "AT5G01075") -> p

p

p$data$type <- gsub("_R[0-9]", "", p$data$time_point)
p$data$type <- factor(p$data$type, levels = unique(p$data$type))

p +
    ggplot2::geom_point(aes(color = type))


## -----------------------------------------------------------------------------
plot_peakGeneAlias_summary(mmAnno = mmAnnoCor_linkEP,
                           mmAnno_corFilter = subset(mmAnnoCor_linkEP, cor > 0.8))

## ----eval=FALSE---------------------------------------------------------------
#  InteractiveFindIT2::shinyParse_peakGeneCor(mmAnnoCor_linkEP)

## ----reproduce3, echo=FALSE-------------------------------------------------------------------------------------------
## Session info
library("sessioninfo")
options(width = 120)
session_info()