## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)

## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE----------------
## For links
library("BiocStyle")

## Track time spent on making the vignette
startTime <- Sys.time()

# Bib setup
library("RefManageR")

## Write bibliography information
bib <- c(
    R = citation(),
    BiocFileCache = citation("BiocFileCache")[1],
    BiocStyle = citation("BiocStyle")[1],
    dplyr = citation("dplyr")[1],
    ggplot2 = citation("ggplot2")[1],
    knitr = citation("knitr")[1],
    Matrix = citation("Matrix")[1],
    pheatmap = citation("pheatmap")[1],
    purrr = citation("purrr")[1],
    rafalib = citation("rafalib")[1],
    RefManageR = citation("RefManageR")[1],
    rmarkdown = citation("rmarkdown")[1],
    sessioninfo = citation("sessioninfo")[1],
    SummarizedExperiment = citation("SummarizedExperiment")[1],
    testthat = citation("testthat")[1],
    tibble = citation("tibble")[1],
    tidyr = citation("tidyr")[1],
    tran2021 = RefManageR::BibEntry(
        bibtype = "Article",
        key = "tran2021",
        author = "Tran, Matthew N. and Maynard, Kristen R. and Spangler, Abby and Huuki, Louise A. and Montgomery, Kelsey D. and Sadashivaiah, Vijay and Tippani, Madhavi and Barry, Brianna K. and Hancock, Dana B. and Hicks, Stephanie C. and Kleinman, Joel E. and Hyde, Thomas M. and Collado-Torres, Leonardo and Jaffe, Andrew E. and Martinowich, Keri",
        title = "Single-nucleus transcriptome analysis reveals cell-type-specific molecular signatures across reward circuitry in the human brain",
        year = 2021, doi = "10.1016/j.neuron.2021.09.001",
        journal = "Neuron"
    ),
    TREG = citation("TREG")[1],
    TREGpaper = citation("TREG")[2]
)

## ----'install', eval = FALSE--------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE)) {
#      install.packages("BiocManager")
#  }
#  
#  BiocManager::install("TREG")
#  
#  ## Check that you have a valid Bioconductor installation
#  BiocManager::valid()

## ----'citation'---------------------------------------------------------------
## Citation info
citation("TREG")

## ----"load_TREG", message = FALSE, warning = FALSE----------------------------
library("TREG")

## ----"load_packages", message=FALSE, warning=FALSE----------------------------
library("SingleCellExperiment")
library("pheatmap")
library("dplyr")
library("ggplot2")
library("tidyr")
library("tibble")

## ----"download_DLPFC_data"----------------------------------------------------
# Download and save a local cache of the data available at:
# https://github.com/LieberInstitute/10xPilot_snRNAseq-human#processed-data
bfc <- BiocFileCache::BiocFileCache()
url <- paste0(
    "https://libd-snrnaseq-pilot.s3.us-east-2.amazonaws.com/",
    "SCE_DLPFC-n3_tran-etal.rda"
)
local_data <- BiocFileCache::bfcrpath(url, x = bfc)

load(local_data, verbose = TRUE)

## ----"Data Size Check", echo=FALSE--------------------------------------------
## Using 1.01 GB
# lobstr::obj_size(sce.dlpfc.tran) 

## ----"acronyms", echo=FALSE---------------------------------------------------

ct_names <- tibble(
    "Cell Type" = c(
        "Astrocyte",
        "Excitatory Neurons",
        "Microglia",
        "Oligodendrocytes",
        "Oligodendrocyte Progenitor Cells",
        "Inhibitory Neurons"
    ),
    "Acronym" = c(
        "Astro",
        "Excit",
        "Micro",
        "Oligo",
        "OPC",
        "Inhib"
    )
)

knitr::kable(ct_names, caption = "Cell type names and corresponding acronyms used in this dataset", label = "acronyms")

## ----"define_cell_types"------------------------------------------------------
## Explore the dimensions and cell type annotations
dim(sce.dlpfc.tran)
table(sce.dlpfc.tran$cellType)

## Use a lower resolution of cell type annotation
sce.dlpfc.tran$cellType.broad <- gsub("_[A-Z]$", "", sce.dlpfc.tran$cellType)
(cell_type_tab <- table(sce.dlpfc.tran$cellType.broad))

## ----"drop_small_cell_types"--------------------------------------------------
## Find cell types with < 50 cells
(ct_drop <- names(cell_type_tab)[cell_type_tab < 50])

## Filter columns of sce object
sce.dlpfc.tran <- sce.dlpfc.tran[, !sce.dlpfc.tran$cellType.broad %in% ct_drop]

## Check new cell type bread down and dimension
table(sce.dlpfc.tran$cellType.broad)

dim(sce.dlpfc.tran)

## ----"examineSparsity", fig.cap = "Heatmap of the snRNA-seq counts. Illustrates sparseness of unfiltered data."----
## this data is 88% sparse
sum(assays(sce.dlpfc.tran)$counts == 0) / (nrow(sce.dlpfc.tran) * ncol(sce.dlpfc.tran))

## lets make a heatmap of the first 1k genes and 500 cells
count_test <- as.matrix(assays(sce.dlpfc.tran)$logcounts[seq_len(1000), seq_len(500)])
pheatmap(count_test,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    show_colnames = FALSE
)

## ----"top50_filter"-----------------------------------------------------------
row_means <- rowMeans(assays(sce.dlpfc.tran)$logcounts)
(median_row_means <- median(row_means))

sce.dlpfc.tran <- sce.dlpfc.tran[row_means > median_row_means, ]
dim(sce.dlpfc.tran)

## ----"top50FilterHeatmap", fig.cap = "Heatmap of the snRNA-seq counts. With top 50% filtering the data becomes less sparse."----
## this data down to 77% sparse
sum(assays(sce.dlpfc.tran)$counts == 0) / (nrow(sce.dlpfc.tran) * ncol(sce.dlpfc.tran))

## replot heatmap
count_test <- as.matrix(assays(sce.dlpfc.tran)$logcounts[seq_len(1000), seq_len(500)])
pheatmap::pheatmap(count_test,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    show_colnames = FALSE
)

## ----"get_prop_zero"----------------------------------------------------------
# get prop zero for each gene for each cell type
prop_zeros <- get_prop_zero(sce.dlpfc.tran, group_col = "cellType.broad")
head(prop_zeros)

## ----"propZeroDistribution", fig.cap = "Distribution of Proption Zero across cell types and regions."----
# Pivot data longer for plotting
prop_zero_long <- prop_zeros %>%
    rownames_to_column("Gene") %>%
    pivot_longer(!Gene, names_to = "Group", values_to = "prop_zero")

# Plot histograms
(prop_zero_histogram <- ggplot(
    data = prop_zero_long,
    aes(x = prop_zero, fill = Group)
) +
    geom_histogram(binwidth = 0.05) +
    facet_wrap(~Group))

## ----"pickCutoff", fig.cap = "Show Proportion Zero Cutoff on Distributions."----
## Specify a cutoff, here we use 0.9
propZero_limit <- 0.9

## Add a vertical red dashed line where the cutoff is located
prop_zero_histogram +
    geom_vline(xintercept = propZero_limit, color = "red", linetype = "dashed")

## ----"filter_prop_zero"-------------------------------------------------------
## get a list of filtered genes
filtered_genes <- filter_prop_zero(prop_zeros, cutoff = propZero_limit)

## How many genes pass the filter?
length(filtered_genes)

## What % of genes is this
length(filtered_genes) / nrow(sce.dlpfc.tran)

## Filter the sce object
sce.dlpfc.tran <- sce.dlpfc.tran[filtered_genes, ]

## ----"propZeroFilterHeatmap", fig.cap = "Heatmap of the snRNA-seq counts after Proption Zero filtering."----
## this data down to 50% sparse
sum(assays(sce.dlpfc.tran)$counts == 0) / (nrow(sce.dlpfc.tran) * ncol(sce.dlpfc.tran))

## re-plot heatmap
count_test <- as.matrix(assays(sce.dlpfc.tran)$logcounts[seq_len(1000), seq_len(500)])
pheatmap::pheatmap(count_test,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    show_rownames = FALSE,
    show_colnames = FALSE
)

## ----"rank_invariance_stepwise"-----------------------------------------------
## Get the rank of the gene in each group
group_rank <- rank_group(sce.dlpfc.tran, group_col = "cellType.broad")

## Get the rank of the gene for each cell
cell_rank <- rank_cells(sce.dlpfc.tran, group_col = "cellType.broad")

## Use both rankings to calculate rank_invariance()
rank_invar <- rank_invariance(group_rank, cell_rank)

## The top 5 Canidate TREGs:
head(sort(rank_invar, decreasing = TRUE))

## ----"run_RI_express"---------------------------------------------------------
## rank_invariance_express() runs the previous functions for you
rank_invar2 <- rank_invariance_express(
    sce.dlpfc.tran,
    group_col = "cellType.broad"
)
## Again the top 5 Canidate TREGs:
head(sort(rank_invar2, decreasing = TRUE))

## Check computationally that the results are identical
stopifnot(identical(rank_invar, rank_invar2))

## ----createVignette, eval=FALSE-----------------------------------------------
#  ## Create the vignette
#  library("rmarkdown")
#  system.time(render("finding_Total_RNA_Expression_Genes.Rmd"))
#  
#  ## Extract the R code
#  library("knitr")
#  knit("finding_Total_RNA_Expression_Genes.Rmd", tangle = TRUE)

## ----reproduce1, echo=FALSE---------------------------------------------------
## Date the vignette was generated
Sys.time()

## ----reproduce2, echo=FALSE---------------------------------------------------
## Processing time in seconds
totalTime <- diff(c(startTime, Sys.time()))
round(totalTime, digits = 3)

## ----reproduce3, echo=FALSE-------------------------------------------------------------------------------------------
## Session info
library("sessioninfo")
options(width = 120)
session_info()

## ----vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE, message = FALSE---------------------------------
## Print bibliography
PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))