## ----init, include=FALSE, echo=FALSE, message=FALSE, warning=FALSE------------ knitr::opts_chunk$set( echo=TRUE, warning=FALSE, message=FALSE, error=FALSE) #, dpi=150) # TODO: # 1. update text to favor overrepresentation analysis method `ora` # 2. describe how its data.frame input works (ora) # 3. provide a brief comparison to its performance vs goseq, with and without # accounting for bias). You can take the code for that from the # enrichtest/goseq PAC unit test. ## ----sparrow-methods, echo=FALSE, message=FALSE, warning=FALSE---------------- dplyr::select(sparrow::sparrow_methods(), method, test_type, package) ## ----init-env, warning=FALSE, message=FALSE----------------------------------- library(sparrow) library(magrittr) library(dplyr) library(ggplot2) library(ComplexHeatmap) library(circlize) library(edgeR) library(data.table) theme_set(theme_bw()) ## ----data-setup, eval=!exists('y.all'), results='hide'------------------------ vm <- exampleExpressionSet(dataset = "tumor-subtype", do.voom = TRUE) ## ----------------------------------------------------------------------------- vm$targets %>% select(Patient_ID, Cancer_Status, PAM50subtype) ## ----------------------------------------------------------------------------- vm$design ## ----contrast-setup----------------------------------------------------------- (cm <- makeContrasts(BvH=Basal - Her2, levels=vm$design)) ## ----dge-analysis------------------------------------------------------------- fit <- lmFit(vm, vm$design) %>% contrasts.fit(cm) %>% eBayes tt <- topTable(fit, 'BvH', n=Inf, sort.by='none') ## ----build-gdb---------------------------------------------------------------- gdb <- getMSigGeneSetDb(c("h", "c2"), "human", id.type = "entrez") ## ----geneSets-accessor-------------------------------------------------------- geneSets(gdb) %>% head %>% select(1:4) ## ----run-multi-GSEA, results='hide', warning=FALSE---------------------------- mg <- seas( vm, gdb, c('camera', 'fry', 'ora'), design = vm$design, contrast = cm[, 'BvH'], # these parameters define which genes are differentially expressed feature.max.padj = 0.05, feature.min.logFC = 1, # for camera: inter.gene.cor = 0.01, # specifies the numeric covariate to bias-correct for # "size" is found in the vm$genes data.frame, which makes its way to the # internal DGE statistics table ... more on that later feature.bias = "size") ## ----logFC-results------------------------------------------------------------ lfc <- logFC(mg) lfc %>% select(symbol, entrez_id, logFC, t, pval, padj) %>% head ## ----compare-dge-t-stats------------------------------------------------------ comp <- tt %>% select(entrez_id, logFC, t, pval=P.Value, padj=adj.P.Val) %>% inner_join(lfc, by='entrez_id', suffix=c('.tt', '.mg')) all.equal(comp$t.tt, comp$t.mg) ## ----mg-res------------------------------------------------------------------- mg ## ----resultnames-------------------------------------------------------------- resultNames(mg) ## ----gsea-res----------------------------------------------------------------- cam.res <- result(mg, 'camera') cam.go.res <- results(mg, c('camera', 'ora.up')) ## ----camera-summary----------------------------------------------------------- cam.res %>% filter(padj < 0.1, collection == 'H') %>% arrange(desc(mean.logFC)) %>% select(name, n, mean.logFC, padj) %>% head ## ----geneset-result----------------------------------------------------------- geneSet(mg, name = 'HALLMARK_WNT_BETA_CATENIN_SIGNALING') %>% select(symbol, entrez_id, logFC, pval, padj) %>% head() ## ----iplot-wnt-beta, fig.asp=1, eval=FALSE------------------------------------ # iplot(mg, 'HALLMARK_WNT_BETA_CATENIN_SIGNALING', # type = 'boxplot', value = 'logFC') ## ----iplot-wnt-beta-density, fig.asp=1, eval=FALSE---------------------------- # iplot(mg, 'HALLMARK_WNT_BETA_CATENIN_SIGNALING', # type = 'density', value = 'logFC') ## ----iplot-wnt-beta-gsea, fig.asp=1, eval=FALSE, message=FALSE, warning=FALSE---- # iplot(mg, 'HALLMARK_WNT_BETA_CATENIN_SIGNALING', # type = 'gsea', value = 'logFC') ## ----ssgenesets--------------------------------------------------------------- sig.res <- cam.res %>% filter(padj < 0.05 & (grepl("HALLMARK", name) | abs(mean.logFC) >= 2)) gdb.sub <- gdb[geneSets(gdb)$name %in% sig.res$name] ## ----subset-exprs------------------------------------------------------------- vm.bh <- vm[, vm$targets$PAM50subtype %in% c("Basal", "Her2")] ## ----ssscore, warning=FALSE--------------------------------------------------- scores <- scoreSingleSamples(gdb.sub, vm.bh, methods = c('ewm', 'ssgsea', 'zscore'), ssgsea.norm = TRUE, unscale=FALSE, uncenter=FALSE, as.dt = TRUE) ## ----sss-pairs, warning=FALSE------------------------------------------------- # We miss you, reshape2::acast sw <- dcast(scores, name + sample_id ~ method, value.var="score") corplot(sw[, -(1:2), with = FALSE], cluster=TRUE) ## ---- warning=FALSE----------------------------------------------------------- ewmu <- scoreSingleSamples(gdb.sub, vm.bh,methods = "ewm", unscale = TRUE, uncenter = TRUE, as.dt = TRUE) ewmu[, method := "ewm_unscale"] scores.all <- rbind(scores, ewmu) swa <- dcast(scores.all, name + sample_id ~ method, value.var="score") corplot(swa[, -(1:2), with = FALSE], cluster=TRUE) ## ----anno-scores-------------------------------------------------------------- all.scores <- scores.all %>% inner_join(select(vm.bh$targets, sample_id=Sample_ID, subtype=PAM50subtype), by = "sample_id") some.scores <- all.scores %>% filter(name %in% head(unique(all.scores$name), 5)) ggplot(some.scores, aes(subtype, score)) + geom_boxplot(outlier.shape=NA) + geom_jitter(width=0.25) + facet_grid(name ~ method) ## ----gheatmap, fig.height=8, fig.width=4-------------------------------------- gs.sub <- geneSets(gdb.sub) gdb.2 <- gdb.sub[geneSets(gdb.sub)$name %in% head(gs.sub$name, 2)] col.anno <- HeatmapAnnotation( df = vm.bh$targets[, 'PAM50subtype', drop = FALSE], col = list(PAM50subtype = c(Basal = "gray", Her2 = "black"))) mgheatmap(vm.bh, gdb.2, aggregate.by = "none", split = TRUE, show_row_names = FALSE, show_column_names = FALSE, recenter = TRUE, top_annotation = col.anno, zlim = c(-3, 3)) ## ----gshm2, fig.height = 2.5, fig.width = 8----------------------------------- mgheatmap(vm.bh, gdb.2, aggregate.by = "ewm", split = FALSE, show_row_names = TRUE, show_column_names = FALSE, top_annotation = col.anno) ## ----gshm-all, fig.height = 6, fig.width = 8---------------------------------- mgheatmap(vm.bh, gdb.sub, aggregate.by = 'ewm', split=TRUE, recenter = TRUE, show_row_names=TRUE, show_column_names=FALSE, top_annotation=col.anno, zlim = c(-2.5, 2.5)) ## ----------------------------------------------------------------------------- as.data.frame(gdb)[c(1:5, 201:205),] ## ---- eval=FALSE-------------------------------------------------------------- # msigdb <- getMSigGeneSetDb('H', 'human') # goslimdb <- getPantherGeneSetDb('goslim', 'human') # gdb.uber <- combine(msigdb, goslimdb) ## ----subset-gdb-by-metadata--------------------------------------------------- keep <- geneSets(gdb)$subcategory == "CP:PID" gdb.sub <- gdb[keep] geneSets(gdb.sub) %>% head() ## ----subset-gdb--------------------------------------------------------------- gdb.sub2 <- subsetByFeatures(gdb, c('10014', '1454')) nrow(gdb); nrow(gdb.sub2) ## ----------------------------------------------------------------------------- gdbc <- conform(gdb, vm, min.gs.size=10, max.gs.size=100) head(geneSets(gdbc, active.only=FALSE)) ## ----------------------------------------------------------------------------- missed <- setdiff( featureIds(gdbc, 'C2', 'ABBUD_LIF_SIGNALING_1_DN', active.only=FALSE), featureIds(gdbc, 'C2', 'ABBUD_LIF_SIGNALING_1_DN', active.only=TRUE)) missed ## ----------------------------------------------------------------------------- gdbc %>% geneSet('C2', 'ABBUD_LIF_SIGNALING_1_DN', active.only = FALSE) %>% subset(feature_id %in% missed) ## ----------------------------------------------------------------------------- vm <- exampleExpressionSet() vms <- renameRows(vm, "symbol") head(cbind(rownames(vm), rownames(vms))) ## ---- eval=FALSE-------------------------------------------------------------- # mg <- seas(vm, gdb, "goseq", design = vm$design, cm[, 'BvH'], # treat.lfc=log2(1.5), # ## feature length vector required for goseq # feature.bias=setNames(vm$genes$size, rownames(vm))) ## ---- eval=FALSE-------------------------------------------------------------- # mgx <- seas(vm, gdb, c('camera', 'roast'), # design = vm$design, contrast = cm[, 'BvH'], # inter.gene.cor = 0.04, nrot = 500) ## ----session-info------------------------------------------------------------- sessionInfo()