## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE----------------
## Track time spent on making the vignette
startTime <- Sys.time()

## Bib setup
library("RefManageR")

## Write bibliography information
bib <- c(
    R = citation(),
    AnnotationDbi = RefManageR::BibEntry(
        bibtype = "manual",
        key = "AnnotationDbi",
        author = "Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li",
        title = "AnnotationDbi: Annotation Database Interface",
        year = 2017, doi = "10.18129/B9.bioc.AnnotationDbi"
    ),
    BiocParallel = citation("BiocParallel"),
    BiocStyle = citation("BiocStyle"),
    derfinder = citation("derfinder")[1],
    DESeq2 = citation("DESeq2"),
    sessioninfo = citation("sessioninfo"),
    downloader = citation("downloader"),
    EnsDb.Hsapiens.v79 = citation("EnsDb.Hsapiens.v79"),
    GEOquery = citation("GEOquery"),
    GenomeInfoDb = RefManageR::BibEntry(
        bibtype = "manual",
        key = "GenomeInfoDb",
        author = "Sonali Arora and Martin Morgan and Marc Carlson and H. Pagès",
        title = "GenomeInfoDb: Utilities for manipulating chromosome and other 'seqname' identifiers",
        year = 2017, doi = "10.18129/B9.bioc.GenomeInfoDb"
    ),
    GenomicFeatures = citation("GenomicFeatures"),
    GenomicRanges = citation("GenomicRanges"),
    IRanges = citation("IRanges"),
    knitr = citation("knitr")[3],
    org.Hs.eg.db = citation("org.Hs.eg.db"),
    RCurl = citation("RCurl"),
    recount = citation("recount")[1],
    recountworkflow = citation("recount")[2],
    phenopredict = citation("recount")[3],
    regionReport = citation("regionReport"),
    rentrez = citation("rentrez"),
    RefManageR = citation("RefManageR")[1],
    rmarkdown = citation("rmarkdown")[1],
    rtracklayer = citation("rtracklayer"),
    S4Vectors = RefManageR::BibEntry(
        bibtype = "manual", key = "S4Vectors",
        author = "Hervé Pagès and Michael Lawrence and Patrick Aboyoun",
        title = "S4Vectors: S4 implementation of vector-like and list-like objects",
        year = 2017, doi = "10.18129/B9.bioc.S4Vectors"
    ),
    SummarizedExperiment = RefManageR::BibEntry(
        bibtype = "manual",
        key = "SummarizedExperiment",
        author = "Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès",
        title = "SummarizedExperiment: SummarizedExperiment container",
        year = 2017, doi = "10.18129/B9.bioc.SummarizedExperiment"
    ),
    testthat = citation("testthat")
)

## ----'installDer', eval = FALSE-----------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE)) {
#      install.packages("BiocManager")
#  }
#  
#  BiocManager::install("recount")
#  
#  ## Check that you have a valid Bioconductor installation
#  BiocManager::valid()

## ----'citation'---------------------------------------------------------------
## Citation info
citation("recount")

## ----'ultraQuick', eval = FALSE-----------------------------------------------
#  ## Load library
#  library("recount")
#  
#  ## Find a project of interest
#  project_info <- abstract_search("GSE32465")
#  
#  ## Download the gene-level RangedSummarizedExperiment data
#  download_study(project_info$project)
#  
#  ## Load the data
#  load(file.path(project_info$project, "rse_gene.Rdata"))
#  
#  ## Browse the project at SRA
#  browse_study(project_info$project)
#  
#  ## View GEO ids
#  colData(rse_gene)$geo_accession
#  
#  ## Extract the sample characteristics
#  geochar <- lapply(split(colData(rse_gene), seq_len(nrow(colData(rse_gene)))), geo_characteristics)
#  
#  ## Note that the information for this study is a little inconsistent, so we
#  ## have to fix it.
#  geochar <- do.call(rbind, lapply(geochar, function(x) {
#      if ("cells" %in% colnames(x)) {
#          colnames(x)[colnames(x) == "cells"] <- "cell.line"
#          return(x)
#      } else {
#          return(x)
#      }
#  }))
#  
#  ## We can now define some sample information to use
#  sample_info <- data.frame(
#      run = colData(rse_gene)$run,
#      group = ifelse(grepl("uninduced", colData(rse_gene)$title), "uninduced", "induced"),
#      gene_target = sapply(colData(rse_gene)$title, function(x) {
#          strsplit(strsplit(
#              x,
#              "targeting "
#          )[[1]][2], " gene")[[1]][1]
#      }),
#      cell.line = geochar$cell.line
#  )
#  
#  ## Scale counts by taking into account the total coverage per sample
#  rse <- scale_counts(rse_gene)
#  
#  ## Add sample information for DE analysis
#  colData(rse)$group <- sample_info$group
#  colData(rse)$gene_target <- sample_info$gene_target
#  
#  ## Perform differential expression analysis with DESeq2
#  library("DESeq2")
#  
#  ## Specify design and switch to DESeq2 format
#  dds <- DESeqDataSet(rse, ~ gene_target + group)
#  
#  ## Perform DE analysis
#  dds <- DESeq(dds, test = "LRT", reduced = ~gene_target, fitType = "local")
#  res <- results(dds)
#  
#  ## Explore results
#  plotMA(res, main = "DESeq2 results for SRP009615")
#  
#  ## Make a report with the results
#  library("regionReport")
#  DESeq2Report(dds,
#      res = res, project = "SRP009615",
#      intgroup = c("group", "gene_target"), outdir = ".",
#      output = "SRP009615-results"
#  )

## ----'er_analysis', eval = FALSE----------------------------------------------
#  ## Define expressed regions for study SRP009615, only for chromosome Y
#  regions <- expressed_regions("SRP009615", "chrY",
#      cutoff = 5L,
#      maxClusterGap = 3000L
#  )
#  
#  ## Compute coverage matrix for study SRP009615, only for chromosome Y
#  system.time(rse_ER <- coverage_matrix("SRP009615", "chrY", regions))
#  
#  ## Round the coverage matrix to integers
#  covMat <- round(assays(rse_ER)$counts, 0)
#  
#  ## Get phenotype data for study SRP009615
#  pheno <- colData(rse_ER)
#  
#  ## Complete the phenotype table with the data we got from GEO
#  m <- match(pheno$run, sample_info$run)
#  pheno <- cbind(pheno, sample_info[m, 2:3])
#  
#  ## Build a DESeqDataSet
#  dds_ers <- DESeqDataSetFromMatrix(
#      countData = covMat, colData = pheno,
#      design = ~ gene_target + group
#  )
#  
#  ## Perform differential expression analysis with DESeq2 at the ER-level
#  dds_ers <- DESeq(dds_ers,
#      test = "LRT", reduced = ~gene_target,
#      fitType = "local"
#  )
#  res_ers <- results(dds_ers)
#  
#  ## Explore results
#  plotMA(res_ers, main = "DESeq2 results for SRP009615 (ER-level, chrY)")
#  
#  ## Create a more extensive exploratory report
#  DESeq2Report(dds_ers,
#      res = res_ers,
#      project = "SRP009615 (ER-level, chrY)",
#      intgroup = c("group", "gene_target"), outdir = ".",
#      output = "SRP009615-results-ER-level-chrY"
#  )

## ----'install', eval = FALSE--------------------------------------------------
#  install.packages("BiocManager")
#  BiocManager::install("recount")

## ----'start', message=FALSE---------------------------------------------------
## Load recount R package
library("recount")

## ----'search_abstract'--------------------------------------------------------
## Find a project of interest
project_info <- abstract_search("GSE32465")

## Explore info
project_info

## ----'download'---------------------------------------------------------------
## Download the gene-level RangedSummarizedExperiment data
download_study(project_info$project)

## Load the data
load(file.path(project_info$project, "rse_gene.Rdata"))

## Delete it if you don't need it anymore
unlink(project_info$project, recursive = TRUE)

## ----'explore_rse'------------------------------------------------------------
rse_gene

## This is the sample phenotype data provided by the recount project
colData(rse_gene)

## At the gene level, the row data includes the gene Gencode ids, the gene
## symbols and the sum of the disjoint exons widths, which can be used for
## taking into account the gene length.
rowData(rse_gene)

## At the exon level, you can get the gene Gencode ids from the names of:
# rowRanges(rse_exon)

## ----'browse'-----------------------------------------------------------------
## Browse the project at SRA
browse_study(project_info$project)

## ----'sample_info', warning = FALSE-------------------------------------------
## View GEO ids
colData(rse_gene)$geo_accession

## Extract the sample characteristics
geochar <- lapply(split(colData(rse_gene), seq_len(nrow(colData(rse_gene)))), geo_characteristics)

## Note that the information for this study is a little inconsistent, so we
## have to fix it.
geochar <- do.call(rbind, lapply(geochar, function(x) {
    if ("cells" %in% colnames(x)) {
        colnames(x)[colnames(x) == "cells"] <- "cell.line"
        return(x)
    } else {
        return(x)
    }
}))

## We can now define some sample information to use
sample_info <- data.frame(
    run = colData(rse_gene)$run,
    group = ifelse(grepl("uninduced", colData(rse_gene)$title), "uninduced", "induced"),
    gene_target = sapply(colData(rse_gene)$title, function(x) {
        strsplit(strsplit(
            x,
            "targeting "
        )[[1]][2], " gene")[[1]][1]
    }),
    cell.line = geochar$cell.line
)

## ----'scale_counts'-----------------------------------------------------------
## Scale counts by taking into account the total coverage per sample
rse <- scale_counts(rse_gene)

##### Details about counts #####

## Scale counts to 40 million mapped reads. Not all mapped reads are in exonic
## sequence, so the total is not necessarily 40 million.
colSums(assays(rse)$counts) / 1e6

## Compute read counts
rse_read_counts <- read_counts(rse_gene)
## Difference between read counts and number of reads downloaded by Rail-RNA
colSums(assays(rse_read_counts)$counts) / 1e6 -
    colData(rse_read_counts)$reads_downloaded / 1e6
## Check the help page for read_counts() for more details

## ----'add_sample_info'--------------------------------------------------------
## Add sample information for DE analysis
colData(rse)$group <- sample_info$group
colData(rse)$gene_target <- sample_info$gene_target

## ----'de_analysis'------------------------------------------------------------
## Perform differential expression analysis with DESeq2
library("DESeq2")

## Specify design and switch to DESeq2 format
dds <- DESeqDataSet(rse, ~ gene_target + group)

## Perform DE analysis
dds <- DESeq(dds, test = "LRT", reduced = ~gene_target, fitType = "local")
res <- results(dds)

## ----'maplot', fig.cap = "MA plot for group differences adjusted by gene target for SRP009615 using gene-level data."----
## Explore results
plotMA(res, main = "DESeq2 results for SRP009615")

## ----'make_report', eval = FALSE----------------------------------------------
#  ## Make a report with the results
#  library("regionReport")
#  report <- DESeq2Report(dds,
#      res = res, project = "SRP009615",
#      intgroup = c("group", "gene_target"), outdir = ".",
#      output = "SRP009615-results", nBest = 10, nBestFeatures = 2
#  )

## ----'make_report_real', echo = FALSE, results = 'hide'---------------------------------------------------------------
library("regionReport")

## Load some packages used by regionReport
library("DT")
library("edgeR")
library("ggplot2")
library("RColorBrewer")

## Make it so that the report will be available as a vignette
original <- readLines(system.file("DESeq2Exploration", "DESeq2Exploration.Rmd",
    package = "regionReport"
))
vignetteInfo <- c(
    "vignette: >",
    "  %\\VignetteEngine{knitr::rmarkdown}",
    "  %\\VignetteIndexEntry{Basic DESeq2 results exploration}",
    "  %\\VignetteEncoding{UTF-8}"
)
new <- c(original[1:16], vignetteInfo, original[17:length(original)])
writeLines(new, "SRP009615-results-template.Rmd")

## Now create the report
report <- DESeq2Report(dds,
    res = res, project = "SRP009615",
    intgroup = c("group", "gene_target"), outdir = ".",
    output = "SRP009615-results", device = "png", template = "SRP009615-results-template.Rmd", nBest = 10, nBestFeatures = 2
)

## Clean up
file.remove("SRP009615-results-template.Rmd")

## ----'geneSymbols'----------------------------------------------------------------------------------------------------
## Load required library
library("org.Hs.eg.db")

## Extract Gencode gene ids
gencode <- gsub("\\..*", "", names(recount_genes))

## Find the gene information we are interested in
gene_info <- select(org.Hs.eg.db, gencode, c(
    "ENTREZID", "GENENAME", "SYMBOL",
    "ENSEMBL"
), "ENSEMBL")

## Explore part of the results
dim(gene_info)
head(gene_info)

## ----'define_ers', eval = .Platform$OS.type != 'windows'--------------------------------------------------------------
## Define expressed regions for study SRP009615, only for chromosome Y
regions <- expressed_regions("SRP009615", "chrY",
    cutoff = 5L,
    maxClusterGap = 3000L
)

## Briefly explore the resulting regions
regions

## ----'compute_covMat', eval = .Platform$OS.type != 'windows'----------------------------------------------------------
## Compute coverage matrix for study SRP009615, only for chromosome Y
system.time(rse_ER <- coverage_matrix("SRP009615", "chrY", regions))

## Explore the RSE a bit
dim(rse_ER)
rse_ER

## ----'to_integer', eval = .Platform$OS.type != 'windows'--------------------------------------------------------------
## Round the coverage matrix to integers
covMat <- round(assays(rse_ER)$counts, 0)

## ----'phenoData', eval = .Platform$OS.type != 'windows'---------------------------------------------------------------
## Get phenotype data for study SRP009615
pheno <- colData(rse_ER)

## Complete the phenotype table with the data we got from GEO
m <- match(pheno$run, sample_info$run)
pheno <- cbind(pheno, sample_info[m, 2:3])

## Explore the phenotype data a little bit
head(pheno)

## ----'ers_dds', eval = .Platform$OS.type != 'windows'-----------------------------------------------------------------
## Build a DESeqDataSet
dds_ers <- DESeqDataSetFromMatrix(
    countData = covMat, colData = pheno,
    design = ~ gene_target + group
)

## ----'de_analysis_ers', eval = .Platform$OS.type != 'windows'---------------------------------------------------------
## Perform differential expression analysis with DESeq2 at the ER-level
dds_ers <- DESeq(dds_ers,
    test = "LRT", reduced = ~gene_target,
    fitType = "local"
)
res_ers <- results(dds_ers)

## ----'maploters', eval = .Platform$OS.type != 'windows', fig.cap = "MA plot for group differences adjusted by gene target for SRP009615 using ER-level data (just chrY)."----
## Explore results
plotMA(res_ers, main = "DESeq2 results for SRP009615 (ER-level, chrY)")

## ----'report2', eval = FALSE------------------------------------------------------------------------------------------
#  ## Create the report
#  report2 <- DESeq2Report(dds_ers,
#      res = res_ers,
#      project = "SRP009615 (ER-level, chrY)",
#      intgroup = c("group", "gene_target"), outdir = ".",
#      output = "SRP009615-results-ER-level-chrY"
#  )

## ---- 'gene_annotation'-----------------------------------------------------------------------------------------------
## Gene annotation information
rowRanges(rse_gene_SRP009615)

## Also accessible via
rowData(rse_gene_SRP009615)

## ---- 'exon_annotation'-----------------------------------------------------------------------------------------------
## Get the rse_exon object for study SRP009615
download_study("SRP009615", type = "rse-exon")
load(file.path("SRP009615", "rse_exon.Rdata"))

## Delete it if you don't need it anymore
unlink("SRP009615", recursive = TRUE)

## Annotation information
rowRanges(rse_exon)

## Add a gene_id column
rowRanges(rse_exon)$gene_id <- rownames(rse_exon)

## Example subset
rse_exon_subset <- subset(rse_exon, subset = gene_id == "ENSG00000000003.14")
rowRanges(rse_exon_subset)

## ---- eval = .Platform$OS.type != 'windows'---------------------------------------------------------------------------
## Get the disjoint exons based on EnsDb.Hsapiens.v79 which matches hg38
exons <- reproduce_ranges("exon", db = "EnsDb.Hsapiens.v79")

## Change the chromosome names to match those used in the BigWig files
library("GenomeInfoDb")
seqlevelsStyle(exons) <- "UCSC"

## Get the count matrix at the exon level for chrY
exons_chrY <- keepSeqlevels(exons, "chrY", pruning.mode = "coarse")
exonRSE <- coverage_matrix("SRP009615", "chrY", unlist(exons_chrY),
    chunksize = 3000
)
exonMatrix <- assays(exonRSE)$counts
dim(exonMatrix)
head(exonMatrix)

## Summary the information at the gene level for chrY
exon_gene <- rep(names(exons_chrY), elementNROWS(exons_chrY))
geneMatrix <- do.call(rbind, lapply(split(
    as.data.frame(exonMatrix),
    exon_gene
), colSums))
dim(geneMatrix)
head(geneMatrix)

## ----'gene_fusions'---------------------------------------------------------------------------------------------------
library("recount")

## Download and load RSE object for SRP009615 study
download_study("SRP009615", type = "rse-jx")
load(file.path("SRP009615", "rse_jx.Rdata"))

## Delete it if you don't need it anymore
unlink("SRP009615", recursive = TRUE)

## Exon-exon junctions by class
table(rowRanges(rse_jx)$class)

## Potential gene fusions by chromosome
fusions <- rowRanges(rse_jx)[rowRanges(rse_jx)$class == "fusion"]
fusions_by_chr <- sort(table(seqnames(fusions)), decreasing = TRUE)
fusions_by_chr[fusions_by_chr > 0]

## Genes with the most fusions
head(sort(table(unlist(fusions$symbol_proposed)), decreasing = TRUE))

## ----snaptron---------------------------------------------------------------------------------------------------------
library("GenomicRanges")
junctions <- GRanges(seqnames = "chr2", IRanges(
    start = c(28971711, 29555082, 29754983),
    end = c(29462418, 29923339, 29917715)
))

snaptron_query(junctions)

## ----'rse-fc example'-------------------------------------------------------------------------------------------------
## Example use of download_study()
## for downloading the FANTOM-CAT `rse_fc`
## file for a given study
download_study("DRP000366", type = "rse-fc", download = FALSE)

## ----'recount-brain'--------------------------------------------------------------------------------------------------
## recount-brain citation details
citation("recount")[5]

## ----'downloadAll'----------------------------------------------------------------------------------------------------
## Get data.frame with all the URLs
library("recount")
dim(recount_url)

## Explore URLs
head(recount_url$url)

## ----'actuallyDownload', eval = FALSE---------------------------------------------------------------------------------
#  ## Download all files (will take a while! It's over 8 TB of data)
#  sapply(unique(recount_url$project), download_study, type = "all")

## ----Figure1, out.width="100%", fig.align="center", fig.cap = "SIgn up to SciServer", echo = FALSE--------------------
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/register.png")

## ----Figure2, out.width="100%", fig.align="center", fig.cap = "Click on SciServer Compute,", echo = FALSE-------------
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/sciserver-compute.png")

## ----Figure3, out.width="100%", fig.align="center", fig.cap = "Select the appropriate image and load the recount public volume.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/new-container.png")

## ----Figure4, out.width="100%", fig.align="center", fig.cap = "Create a R notebook.", echo = FALSE--------------------
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/new-R.png")

## ----Figure5, out.width="100%", fig.align="center", fig.cap = "Install recount and DESeq2 on SciServer.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/demo1.png")

## ----Figure6, out.width="100%", fig.align="center", fig.cap = "recount files are available locally so remmeber to use the local data when working on SciServer.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/demo2.png")

## ----'ER-sciserver', eval = FALSE-------------------------------------------------------------------------------------
#  ## Expressed-regions SciServer example
#  regions <- expressed_regions("SRP009615", "chrY",
#      cutoff = 5L,
#      maxClusterGap = 3000L, outdir = file.path(scipath, "SRP009615")
#  )
#  regions

## ----createVignette, eval=FALSE---------------------------------------------------------------------------------------
#  ## Create the vignette
#  library("rmarkdown")
#  system.time(render("recount-quickstart.Rmd", "BiocStyle::html_document"))
#  
#  ## Extract the R code
#  library("knitr")
#  knit("recount-quickstart.Rmd", tangle = TRUE)

## ----reproduce1, echo=FALSE-------------------------------------------------------------------------------------------
## Date the vignette was generated
Sys.time()

## ----reproduce2, echo=FALSE-------------------------------------------------------------------------------------------
## Processing time in seconds
totalTime <- diff(c(startTime, Sys.time()))
round(totalTime, digits = 3)

## ----reproduce3, echo=FALSE-------------------------------------------------------------------------------------------
## Session info
library("sessioninfo")
options(width = 120)
session_info()

## ----'datasetup', echo = FALSE, eval = FALSE--------------------------------------------------------------------------
#  ## Code for re-creating the data distributed in this package
#  
#  ## Genes/exons
#  library("recount")
#  recount_both <- reproduce_ranges("both", db = "Gencode.v25")
#  recount_exons <- recount_both$exon
#  save(recount_exons, file = "../data/recount_exons.RData", compress = "xz")
#  recount_genes <- recount_both$gene
#  save(recount_genes, file = "../data/recount_genes.RData", compress = "xz")
#  
#  ## URL table
#  load("../../recount-website/fileinfo/upload_table.Rdata")
#  recount_url <- upload_table
#  ## Create URLs
#  is.bw <- grepl("[.]bw$", recount_url$file_name)
#  recount_url$url <- NA
#  recount_url$url[!is.bw] <- paste0(
#      "http://duffel.rail.bio/recount/",
#      recount_url$project[!is.bw], "/", recount_url$file_name[!is.bw]
#  )
#  recount_url$url[!is.bw & recount_url$version2] <-
#      paste0(
#          "http://duffel.rail.bio/recount/v2/",
#          recount_url$project[!is.bw & recount_url$version2], "/",
#          recount_url$file_name[!is.bw & recount_url$version2]
#      )
#  recount_url$url[is.bw] <- paste0(
#      "http://duffel.rail.bio/recount/",
#      recount_url$project[is.bw], "/bw/", recount_url$file_name[is.bw]
#  )
#  save(recount_url, file = "../data/recount_url.RData", compress = "xz")
#  
#  ## Add FC-RC2 data
#  load("../data/recount_url.RData")
#  load("../../recount-website/fileinfo/fc_rc_url_table.Rdata")
#  recount_url <- rbind(recount_url, fc_rc_url_table)
#  rownames(recount_url) <- NULL
#  save(recount_url, file = "../data/recount_url.RData", compress = "xz")
#  
#  
#  ## Abstract info
#  load("../../recount-website/website/meta_web.Rdata")
#  recount_abstract <- meta_web[, c("number_samples", "species", "abstract")]
#  recount_abstract$project <- gsub('.*">|</a>', "", meta_web$accession)
#  Encoding(recount_abstract$abstract) <- "latin1"
#  recount_abstract$abstract <- iconv(recount_abstract$abstract, "latin1", "UTF-8")
#  save(recount_abstract,
#      file = "../data/recount_abstract.RData",
#      compress = "bzip2"
#  )
#  
#  ## Example rse_gene file
#  system("scp e:/dcl01/leek/data/recount-website/rse/rse_sra/SRP009615/rse_gene.Rdata .")
#  load("rse_gene.Rdata")
#  rse_gene_SRP009615 <- rse_gene
#  save(rse_gene_SRP009615,
#      file = "../data/rse_gene_SRP009615.RData",
#      compress = "xz"
#  )
#  unlink("rse_gene.Rdata")

## ----'cleanup', echo = FALSE, results = 'hide'------------------------------------------------------------------------
## Clean up
unlink("SRP009615", recursive = TRUE)

## ----vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE, message = FALSE---------------------------------
## Print bibliography
PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))