## ----setup, include = FALSE--------------------------------------------------- library(fluentGenomics) dir <- system.file("extdata", package="macrophage") library(tximeta) makeLinkedTxome( indexDir=file.path(dir, "gencode.v29_salmon_0.12.0"), source="Gencode", organism="Homo sapiens", release="29", genome="GRCh38", fasta="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz", gtf=file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version write=FALSE ) knitr::opts_chunk$set( fig.align = "center" ) ## ----workflow, fig.cap = "(ref:workflow)", fig.align='center', out.width="\\textwidth", echo = FALSE---- knitr::include_graphics("workflow.png") ## ----read-cache, eval = FALSE------------------------------------------------- # library(fluentGenomics) # atac <- readRDS(cache_atac_se()) ## ----dir, eval=FALSE---------------------------------------------------------- # dir <- "/path/to/quant/files" ## ----setdir------------------------------------------------------------------- dir <- system.file("extdata", package="macrophage") ## ----coldata-rna-------------------------------------------------------------- library(dplyr) library(readr) colfile <- file.path(dir, "coldata.csv") coldata <- read_csv(colfile) %>% dplyr::select( names, id = sample_id, line = line_id, condition = condition_name ) %>% dplyr::mutate( files = file.path(dir, "quants", names, "quant.sf.gz"), line = factor(line), condition = relevel(factor(condition), "naive") ) coldata ## ----tximeta-run-------------------------------------------------------------- suppressPackageStartupMessages(library(SummarizedExperiment)) library(tximeta) se <- tximeta(coldata, dropInfReps=TRUE) se ## ----linkedtxome-ex, eval = FALSE--------------------------------------------- # makeLinkedTxome( # indexDir=file.path(dir, "gencode.v29_salmon_0.12.0"), # source="Gencode", # organism="Homo sapiens", # release="29", # genome="GRCh38", # fasta="ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_29/gencode.v29.transcripts.fa.gz", # gtf=file.path(dir, "gencode.v29.annotation.gtf.gz"), # local version # write=FALSE # ) ## ----gse---------------------------------------------------------------------- gse <- summarizeToGene(se) ## ----coldata-atac, eval=FALSE------------------------------------------------- # atac_coldata <- read_tsv("ATAC_sample_metadata.txt.gz") %>% # select( # sample_id, # donor, # condition = condition_name # ) %>% # mutate(condition = relevel(factor(condition), "naive")) ## ----mat-atac, eval=FALSE----------------------------------------------------- # atac_mat <- read_tsv("ATAC_cqn_matrix.txt.gz", # skip = 1, # col_names =c("rownames", atac_coldata[["sample_id"]])) # rownames <- atac_mat[["rownames"]] # atac_mat <- as.matrix(atac_mat[,-1]) # rownames(atac_mat) <- rownames ## ----peaks-atac, eval=FALSE--------------------------------------------------- # library(plyranges) # peaks_df <- read_tsv("ATAC_peak_metadata.txt.gz", # col_types = c("cidciicdc") # ) # # peaks_gr <- peaks_df %>% # as_granges(seqnames = chr) %>% # select(peak_id=gene_id) %>% # set_genome_info(genome = "GRCh38") ## ----atac-se, eval=FALSE------------------------------------------------------ # atac <- SummarizedExperiment(assays = list(cqndata=atac_mat), # rowRanges=peaks_gr, # colData=atac_coldata) ## ----setup-deseq-------------------------------------------------------------- library(DESeq2) dds <- DESeqDataSet(gse, ~line + condition) # filter out lowly expressed genes # at least 10 counts in at least 6 samples keep <- rowSums(counts(dds) >= 10) >= 6 dds <- dds[keep,] ## ----fit-model---------------------------------------------------------------- dds <- DESeq(dds) ## ----results-DFrame----------------------------------------------------------- res <- results(dds, contrast=c("condition","IFNg","naive"), lfcThreshold=1, alpha=0.01) ## ----ma-plot, fig.cap="(ref:maplot)"------------------------------------------ summary(res) DESeq2::plotMA(res, ylim=c(-10,10)) ## ----results-GRanges---------------------------------------------------------- suppressPackageStartupMessages(library(plyranges)) de_genes <- results(dds, contrast=c("condition","IFNg","naive"), lfcThreshold=1, format="GRanges") %>% names_to_column("gene_id") de_genes ## ----de-genes----------------------------------------------------------------- de_genes <- de_genes %>% filter(padj < 0.01) %>% select(gene_id, de_log2FC = log2FoldChange, de_padj = padj) ## ----not-de-genes------------------------------------------------------------- other_genes <- results(dds, contrast=c("condition","IFNg","naive"), lfcThreshold=1, altHypothesis="lessAbs", format="GRanges") %>% filter(padj < 0.01) %>% names_to_column("gene_id") %>% dplyr::select(gene_id, de_log2FC = log2FoldChange, de_padj = padj) ## ----limma, eval = FALSE------------------------------------------------------ # library(limma) # design <- model.matrix(~donor + condition, colData(atac)) # fit <- lmFit(assay(atac), design) # fit <- eBayes(fit) # idx <- which(colnames(fit$coefficients) == "conditionIFNg") # tt <- topTable(fit, coef=idx, sort.by="none", n=nrow(atac)) ## ----peaks-tidy, eval = FALSE------------------------------------------------- # atac_peaks <- rowRanges(atac) %>% # remove_names() %>% # mutate( # da_log2FC = tt$logFC, # da_padj = tt$adj.P.Val # ) %>% # set_genome_info(genome = "hg38") # # seqlevelsStyle(atac_peaks) <- "UCSC" ## ----load-peaks--------------------------------------------------------------- library(fluentGenomics) peaks ## ----filter-peaks------------------------------------------------------------- da_peaks <- peaks %>% filter(da_padj < 0.01) ## ----slice-example------------------------------------------------------------ size <- length(de_genes) slice(other_genes, sample.int(n(), size)) ## ----boot-set-01-------------------------------------------------------------- # set a seed for the results set.seed(2019-08-02) boot_genes <- replicate(10, slice(other_genes, sample.int(n(), size)), simplify = FALSE) ## ----boot-set-02-------------------------------------------------------------- boot_genes <- bind_ranges(boot_genes, .id = "resample") ## ----combine-results---------------------------------------------------------- all_genes <- bind_ranges( de=de_genes, not_de = boot_genes, .id="origin" ) %>% mutate( origin = factor(origin, c("not_de", "de")), resample = ifelse(is.na(resample), 0L, as.integer(resample)) ) all_genes ## ----resize-01---------------------------------------------------------------- all_genes <- all_genes %>% anchor_5p() %>% mutate(width = 1) ## ----resize-02---------------------------------------------------------------- all_genes <- all_genes %>% anchor_center() %>% mutate(width=2*1e4) ## ----olap-join---------------------------------------------------------------- genes_olap_peaks <- all_genes %>% join_overlap_left(da_peaks) genes_olap_peaks ## ----reduce-ex01-------------------------------------------------------------- gene_peak_max_lfc <- genes_olap_peaks %>% group_by(gene_id, origin) %>% reduce_ranges_directed( peak_count = sum(!is.na(da_padj)) / n_distinct(resample), peak_max_lfc = max(abs(da_log2FC)) ) ## ----boxplot, fig.cap = "(ref:boxplot)"--------------------------------------- library(ggplot2) gene_peak_max_lfc %>% filter(peak_count > 0) %>% as.data.frame() %>% ggplot(aes(origin, peak_max_lfc)) + geom_boxplot() ## ----summarize-ex01----------------------------------------------------------- origin_peak_lfc <- genes_olap_peaks %>% group_by(origin) %>% summarize( peak_count = sum(!is.na(da_padj)) / n_distinct(resample), lfc1_peak_count =sum(abs(da_log2FC) > 1, na.rm=TRUE)/ n_distinct(resample), lfc2_peak_count = sum(abs(da_log2FC) > 2, na.rm=TRUE)/ n_distinct(resample) ) origin_peak_lfc ## ----pivot-enrich------------------------------------------------------------- origin_peak_lfc %>% as.data.frame() %>% tidyr::pivot_longer(cols = -origin) %>% tidyr::pivot_wider(names_from = origin, values_from = value) %>% mutate(enrichment = de / not_de) ## ----reduce-summarize--------------------------------------------------------- genes_olap_peaks %>% group_by(gene_id, origin, resample) %>% reduce_ranges_directed( lfc1 = sum(abs(da_log2FC) > 1, na.rm=TRUE), lfc2 = sum(abs(da_log2FC) > 2, na.rm=TRUE) ) %>% group_by(origin) %>% summarize( lfc1_gene_count = sum(lfc1 > 0) / n_distinct(resample), lfc1_peak_count = sum(lfc1) / n_distinct(resample), lfc2_gene_count = sum(lfc2 > 0) / n_distinct(resample), lfc2_peak_count = sum(lfc2) / n_distinct(resample) ) ## ----count-fn----------------------------------------------------------------- count_if_above_threshold <- function(var, thresholds) { lapply(thresholds, function(.) sum(abs(var) > ., na.rm = TRUE)) } ## ----thresholds--------------------------------------------------------------- thresholds <- da_peaks %>% mutate(abs_lfc = abs(da_log2FC)) %>% with( seq(min(abs_lfc), max(abs_lfc), length.out = 100) ) ## ----reduce-ex02-------------------------------------------------------------- genes_peak_all_thresholds <- genes_olap_peaks %>% group_by(gene_id, origin, resample) %>% reduce_ranges_directed( value = count_if_above_threshold(da_log2FC, thresholds), threshold = list(thresholds) ) genes_peak_all_thresholds ## ----expand-summarize--------------------------------------------------------- origin_peak_all_thresholds <- genes_peak_all_thresholds %>% expand_ranges() %>% group_by(origin, threshold) %>% summarize( gene_count = sum(value > 0) / n_distinct(resample), peak_count = sum(value) / n_distinct(resample) ) origin_peak_all_thresholds ## ----line-chart, fig.cap = "(ref:linechart)"---------------------------------- origin_threshold_counts <- origin_peak_all_thresholds %>% as.data.frame() %>% tidyr::pivot_longer(cols = -c(origin, threshold), names_to = c("type", "var"), names_sep = "_", values_to = "count") %>% select(-var) origin_threshold_counts %>% filter(type == "peak") %>% tidyr::pivot_wider(names_from = origin, values_from = count) %>% mutate(enrichment = de / not_de) %>% ggplot(aes(x = threshold, y = enrichment)) + geom_line() + labs(x = "logFC threshold", y = "Relative Enrichment") ## ----line-chart2, fig.cap = "(ref:linechart2)"-------------------------------- origin_threshold_counts %>% ggplot(aes(x = threshold, y = count + 1, color = origin, linetype = type)) + geom_line() + scale_y_log10() ## ----eval = FALSE------------------------------------------------------------- # # development version from Github # BiocManager::install("sa-lee/fluentGenomics") # # version available from Bioconductor # BiocManager::install("fluentGenomics") ## ----------------------------------------------------------------------------- sessionInfo()