## ----echo = FALSE, message = FALSE, warning = FALSE---------------------------
library(BiocStyle)
library(cowplot)

## ----load-libs, message = FALSE,  warning = FALSE-----------------------------
library(dplyr)
library(ggplot2)
library(muscat)
library(purrr)

## ----eh, cache = TRUE, message = FALSE----------------------------------------
library(ExperimentHub)
eh <- ExperimentHub()
query(eh, "Kang")

## ----load-data, message = FALSE-----------------------------------------------
(sce <- eh[["EH2259"]])

## ----fil----------------------------------------------------------------------
# remove undetected genes
sce <- sce[rowSums(counts(sce) > 0) > 0, ]
dim(sce)

## ----qc, message = FALSE------------------------------------------------------
# calculate per-cell quality control (QC) metrics
library(scater)
qc <- perCellQCMetrics(sce)

# remove cells with few or many detected genes
ol <- isOutlier(metric = qc$detected, nmads = 2, log = TRUE)
sce <- sce[, !ol]
dim(sce)

# remove lowly expressed genes
sce <- sce[rowSums(counts(sce) > 1) >= 10, ]
dim(sce)

## ----norm---------------------------------------------------------------------
# compute sum-factors & normalize
sce <- computeLibraryFactors(sce)
sce <- logNormCounts(sce)

## ----vst, eval = FALSE--------------------------------------------------------
#  library(sctransform)
#  assays(sce)$vstresiduals <- vst(counts(sce), show_progress = FALSE)$y

## ----prep-sce-----------------------------------------------------------------
sce$id <- paste0(sce$stim, sce$ind)
(sce <- prepSCE(sce, 
    cluster_id = "cell", # subpopulation assignments
    group_id = "stim",   # group IDs (ctrl/stim)
    sample_id = "id",    # sample IDs (ctrl/stim.1234)
    drop = TRUE))        # drop all other colData columns

## ----ids----------------------------------------------------------------------
nk <- length(kids <- levels(sce$cluster_id))
ns <- length(sids <- levels(sce$sample_id))
names(kids) <- kids; names(sids) <- sids

## ----ncells, size = "small"---------------------------------------------------
# nb. of cells per cluster-sample
t(table(sce$cluster_id, sce$sample_id))

## ----umap---------------------------------------------------------------------
# compute UMAP using 1st 20 PCs
sce <- runUMAP(sce, pca = 20)

## -----------------------------------------------------------------------------
# wrapper to prettify reduced dimension plots
.plot_dr <- function(sce, dr, col)
  plotReducedDim(sce, dimred = dr, colour_by = col) +
    guides(fill = guide_legend(override.aes = list(alpha = 1, size = 3))) +
    theme_minimal() + theme(aspect.ratio = 1)

## ----eval = FALSE-------------------------------------------------------------
#  # downsample to max. 100 cells per cluster
#  cs_by_k <- split(colnames(sce), sce$cluster_id)
#  cs100 <- unlist(sapply(cs_by_k, function(u)
#    sample(u, min(length(u), 100))))
#  
#  # plot t-SNE & UMAP colored by cluster & group ID
#  for (dr in c("TSNE", "UMAP"))
#    for (col in c("cluster_id", "group_id"))
#      .plot_dr(sce[, cs100], dr, col)

## ----dr-ids, echo = FALSE, results = "asis", fig.height = 4, fig.width = 12, fig.cap = "Dimension reduction plots. Cells are colored by cluster ID (A) and group ID (B), respectively. For each cluster, at most 100 cells were sampled for plotting."----
cs_by_k <- split(colnames(sce), sce$cluster_id)
cs100 <- unlist(sapply(cs_by_k, function(u) 
  sample(u, min(length(u), 100))))

for (dr in c("TSNE", "UMAP")) {
  cat("#### ", dr, "{-}\n")
  ps <- lapply(c("cluster_id", "group_id"), 
    function(col) .plot_dr(sce[, cs100], dr, col = col))
  assign(paste0("ps_", tolower(dr)), ps)
  print(plot_grid(plotlist = ps, align = "vh", labels = c("A", "B")))
  cat("\n\n")
}

## ----echo = FALSE, out.height = 4, fig.cap = "Schematic overview of cell- and sample-level approaches for DS analysis. Top panels show a schematic of the data distributions or aggregates across samples (each violin is a group or sample; each dot is a sample) and conditions (blue or orange). The bottom panels highlight the data organization in sub-matrix slices of the original count table."----
knitr::include_graphics(system.file('extdata', 'ds_approaches.png', package = 'muscat'))

## ----agg----------------------------------------------------------------------
pb <- aggregateData(sce,
    assay = "counts", fun = "sum",
    by = c("cluster_id", "sample_id"))
# one sheet per subpopulation
assayNames(pb)
# pseudobulks for 1st subpopulation
t(head(assay(pb)))

## ----pb-mds, fig.height = 4, fig.cap = "Pseudobulk-level multidimensional scaling (MDS) plot. Each point represents a cluster-sample instance; points are colored by cluster ID and shaped by group ID."----
(pb_mds <- pbMDS(pb))

## ----message = FALSE, fig.height = 4, fig.cap = "Pseudobulk-level MDS plot v2. Default plotting aesthetics were modified to change shaping of groups, coloring of clusters, as well as point size and transparency."----
# use very distinctive shaping of groups & change cluster colors
pb_mds <- pb_mds + 
  scale_shape_manual(values = c(17, 4)) +
  scale_color_manual(values = RColorBrewer::brewer.pal(8, "Set2"))
# change point size & alpha
pb_mds$layers[[1]]$aes_params$size <- 5
pb_mds$layers[[1]]$aes_params$alpha <- 0.6
pb_mds

## -----------------------------------------------------------------------------
# run DS analysis
res <- pbDS(pb, verbose = FALSE)
# access results table for 1st comparison
tbl <- res$table[[1]]
# one data.frame per cluster
names(tbl)
# view results for 1st cluster
k1 <- tbl[[1]]
head(format(k1[, -ncol(k1)], digits = 2))

## ----eval = FALSE-------------------------------------------------------------
#  # construct design & contrast matrix
#  ei <- metadata(sce)$experiment_info
#  mm <- model.matrix(~ 0 + ei$group_id)
#  dimnames(mm) <- list(ei$sample_id, levels(ei$group_id))
#  contrast <- makeContrasts("stim-ctrl", levels = mm)
#  
#  # run DS analysis
#  pbDS(pb, design = mm, contrast = contrast)

## ----mm, eval = FALSE---------------------------------------------------------
#  # 1st approach
#  mm <- mmDS(sce, method = "dream",
#    n_cells = 10, n_samples = 2,
#    min_counts = 1, min_cells = 20)
#  
#  # 2nd & 3rd approach
#  mm <- mmDS(sce, method = "vst", vst = "sctransform")
#  mm <- mmDS(sce, method = "nbinom")

## -----------------------------------------------------------------------------
# filter FDR < 5%, abs(logFC) > 1 & sort by adj. p-value
tbl_fil <- lapply(tbl, function(u) {
  u <- dplyr::filter(u, p_adj.loc < 0.05, abs(logFC) > 1)
  dplyr::arrange(u, p_adj.loc)
})

# nb. of DS genes & % of total by cluster
n_de <- vapply(tbl_fil, nrow, numeric(1))
p_de <- format(n_de / nrow(sce) * 100, digits = 3)
data.frame("#DS" = n_de, "%DS" = p_de, check.names = FALSE)

# view top 2 hits in each cluster
top2 <- bind_rows(lapply(tbl_fil, top_n, 2, p_adj.loc))
format(top2[, -ncol(top2)], digits = 2)

## ----frq----------------------------------------------------------------------
frq <- calcExprFreqs(sce, assay = "counts", th = 0)
# one sheet per cluster
assayNames(frq)
# expression frequencies in each
# sample & group; 1st cluster
t(head(assay(frq), 5))

## -----------------------------------------------------------------------------
gids <- levels(sce$group_id)
frq10 <- vapply(as.list(assays(frq)), 
  function(u) apply(u[, gids] > 0.1, 1, any), 
  logical(nrow(sce)))
t(head(frq10))

tbl_fil2 <- lapply(kids, function(k)
  dplyr::filter(tbl_fil[[k]], 
    gene %in% names(which(frq10[, k]))))

# nb. of DS genes & % of total by cluster
n_de <- vapply(tbl_fil2, nrow, numeric(1))
p_de <- format(n_de / nrow(sce) * 100, digits = 3)
data.frame("#DS" = n_de, "%DS" = p_de, check.names = FALSE)

## ----eval = FALSE-------------------------------------------------------------
#  # tidy format; attach pre-computed expression frequencies
#  resDS(sce, res, bind = "row", frq = frq)
#  
#  # big-table (wide) format; attach CPMs
#  resDS(sce, res, bind = "col", cpm = TRUE)

## ----eval = FALSE-------------------------------------------------------------
#  # compute expression frequencies on the fly
#  resDS(sce, res, frq = TRUE)

## ----upset, fig.width = 10, fig.cap = "Upset plot. Included are DS findings (FDR < 0.05, |logFC| > 1) across all clusters; shown are the 50 most frequent interactions."----
library(UpSetR)
de_gs_by_k <- map(tbl_fil, "gene")
upset(fromList(de_gs_by_k))

## ----fig.width = 14, fig.height = 8, fig.cap = "t-SNE colored by gene expression. Show are t-SNE projections with cells colored by the expression of the top-8 DS genes. For each cluster, at most 100 cells were sampled for plotting."----
# pull top-8 DS genes across all clusters
top8 <- bind_rows(tbl_fil) %>% 
  top_n(8, dplyr::desc(p_adj.loc)) %>% 
  pull("gene")

# for ea. gene in 'top8', plot t-SNE colored by its expression 
ps <- lapply(top8, function(g)
  .plot_dr(sce[, cs100], "TSNE", g) + 
    ggtitle(g) + theme(legend.position = "none"))

# arrange plots
plot_grid(plotlist = ps, ncol = 4, align = "vh")

## ----violins, fig.width = 10, fig.height = 5, fig.cap = "Violin plots. Show are the top 6 hits (lowest adj. p-value) for the B cells cluster. Each violin is a sample; points are colored by group ID."----
plotExpression(sce[, sce$cluster_id == "B cells"],
  features = tbl_fil$`B cells`$gene[seq_len(6)],
  x = "sample_id", colour_by = "group_id", ncol = 3) +
  guides(fill = guide_legend(override.aes = list(size = 5, alpha = 1))) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## ----pb-hm-1------------------------------------------------------------------
# top-5 DS genes per cluster
pbHeatmap(sce, res, top_n = 5)

## ----pb-hm-2------------------------------------------------------------------
# top-20 DS genes for single cluster
pbHeatmap(sce, res, k = "B cells")

## ----pb-hm-3------------------------------------------------------------------
# single gene across all clusters
pbHeatmap(sce, res, g = "ISG20")

## -----------------------------------------------------------------------------
sessionInfo()