<!– badges: start –> Lifecycle:maturing <!– badges: end –>

In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.

Create tidybulk tibble.

tt = counts_mini %>% tidybulk(sample, transcript, count)

Aggregate duplicated transcripts

Tidy transcriptomics “{.r .yellow} tt.aggr = tt %>% aggregate_duplicates() ”
Base R “r temp = data.frame( symbol = dge_list$genes$symbol, dge_list$counts ) dge_list.nr <- by(temp, temp$symbol, function(df) if(length(df[1,1])>0) matrixStats:::colSums(as.matrix(df[,-1])) ) dge_list.nr <- do.call("rbind", dge_list.nr) colnames(dge_list.nr) <- colnames(dge_list) ```

Scale counts

Tidy transcriptomics ”r tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance() “
Base R ”r library(edgeR) dgList <- DGEList(count_m=x,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] [...] dgList <- calcNormFactors(dgList, method="TMM") norm_counts.table <- cpm(dgList) ```

Filter variable transcripts

We may want to identify and filter variable transcripts.

Tidy transcriptomics “r tt.norm.variable = tt.norm %>% keep_variable() ”
Base R “r library(edgeR) x = norm_counts.table s <- rowMeans((x-rowMeans(x))^2) o <- order(s,decreasing=TRUE) x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] norm_counts.table$cell_type = tidybulk::counts[ match( tidybulk::counts$sample, rownames(norm_counts.table) ), "Cell type" ] ```

Reduce dimensions

Tidy transcriptomics ”r tt.norm.MDS = tt.norm %>% reduce_dimensions(method=“MDS”, .dims = 2) “
Base R ”r library(limma) count_m_log = log(count_m + 1) cmds = limma::plotMDS(ndim = .dims, plot = FALSE) cmds = cmds %$% cmdscale.out %>% setNames(sprintf(“Dim%s”, 1:6)) cmds$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(cmds)), “Cell type” ] “

PCA

Tidy transcriptomics ”r tt.norm.PCA = tt.norm %>% reduce_dimensions(method=“PCA”, .dims = 2) “
Base R ”r count_m_log = log(count_m + 1) pc = count_m_log %>% prcomp(scale = TRUE) variance = pc$sdev^2 variance = (variance / sum(variance))[1:6] pc$cell_type = counts[ match(counts$sample, rownames(pc)), “Cell type” ] “

tSNE

Tidy transcriptomics ”r tt.norm.tSNE = breast_tcga_mini %>% tidybulk( sample, ens, count_scaled) %>% identify_abundant() %>% reduce_dimensions( method = “tSNE”, perplexity=10, pca_scale =TRUE ) “
Base R ”r count_m_log = log(count_m + 1) tsne = Rtsne::Rtsne( t(count_m_log), perplexity=10, pca_scale =TRUE )$Y tsne$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(tsne)), “Cell type” ] “

Rotate dimensions

Tidy transcriptomics ”r tt.norm.MDS.rotated = tt.norm.MDS %>% rotate_dimensions(Dim1, Dim2, rotation_degrees = 45, action=“get”) “
Base R ”r rotation = function(m, d) { r = d * pi / 180 ((bind_rows( c(1 = cos®, 2 = -sin®), c(1 = sin®, 2 = cos®) ) %>% as_matrix) %*% m) } mds_r = pca %>% rotation(rotation_degrees) mds_r$cell_type = counts[ match(counts$sample, rownames(mds_r)), “Cell type” ] “

Test differential abundance

Tidy transcriptomics ”r tt.de = tt %>% test_differential_abundance( ~ condition, action=“get”) tt.de “
Base R ”r library(edgeR) dgList <- DGEList(counts=counts_m,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) design <- model.matrix(~group) dgList <- estimateDisp(dgList,design) fit <- glmQLFit(dgList,design) qlf <- glmQLFTest(fit,coef=2) topTags(qlf, n=Inf) ```

Adjust counts

Tidy transcriptomics “r tt.norm.adj = tt.norm %>% adjust_abundance( ~ condition + time) ”
Base R “r library(sva) count_m_log = log(count_m + 1) design = model.matrix( object = ~ condition + time, data = annotation ) count_m_log.sva = ComBat( batch = design[,2], mod = design, … ) count_m_log.sva = ceiling(exp(count_m_log.sva) -1) count_m_log.sva$cell_type = counts[ match(counts$sample, rownames(count_m_log.sva)), "Cell type” ] “

Deconvolve Cell type composition

Tidy transcriptomics ”r tt.cibersort = tt %>% deconvolve_cellularity(action=“get”, cores=1) “
Base R ”r source(‘CIBERSORT.R’) count_m %>% write.table(“mixture_file.txt”) results <- CIBERSORT( "sig_matrix_file.txt", "mixture_file.txt", perm=100, QN=TRUE ) results$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(results)), "Cell type" ] ```

Cluster samples

k-means

Tidy transcriptomics “r tt.norm.cluster = tt.norm.MDS %>% cluster_elements(method="kmeans”, centers = 2, action=“get” ) “
Base R ”r count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, …) cluster = k$cluster cluster$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(cluster)), c(“Cell type”, “Dim1”, “Dim2”) ] “

SNN

Tidy transcriptomics ”r tt.norm.SNN = tt.norm.tSNE %>% cluster_elements(method = “SNN”) “
Base R ”r library(Seurat) snn = CreateSeuratObject(count_m) snn = ScaleData( snn, display.progress = TRUE, num.cores=4, do.par = TRUE ) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = RunPCA(snn, npcs = 30) snn = FindNeighbors(snn) snn = FindClusters(snn, method = “igraph”, …) snn = snn[[“seurat_clusters”]] snn$cell_type = tidybulk::counts[ match(tidybulk::counts$sample, rownames(snn)), c(“Cell type”, “Dim1”, “Dim2”) ] “

Drop redundant transcripts

Tidy transcriptomics ”r tt.norm.non_redundant = tt.norm.MDS %>% remove_redundancy( method = “correlation” ) “
Base R ”r library(widyr) .data.correlated = pairwise_cor( counts, sample, transcript, rc, sort = TRUE, diag = FALSE, upper = FALSE ) %>% filter(correlation > correlation_threshold) %>% distinct(item1) %>% rename(!!.element := item1) # Return non redundant data frame counts %>% anti_join(.data.correlated) %>% spread(sample, rc, - transcript) %>% left_join(annotation) “

Draw heatmap

tidytranscriptomics ”r tt.norm.MDS %>% # filter lowly abundant keep_abundant() %>% # extract 500 most variable genes keep_variable( .abundance = count_scaled, top = 500) %>% # create heatmap as_tibble() %>% heatmap(sample, transcript, count_scaled, transform = log1p) %>% add_tile(Cell type) “
Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$`Cell type`) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Draw density plot

tidytranscriptomics “r # Example taken from airway dataset from BioC2020 workshop. airway %>% tidybulk() %>% identify_abundant() %>% scale_abundance() %>% pivot_longer(cols = starts_with("counts”), names_to = “source”, values_to = “abundance”) %>% filter(!lowly_abundant) %>% ggplot(aes(x=abundance + 1, color=sample)) + geom_density() + facet_wrap(~source) + scale_x_log10() “
Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$dex) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Appendix

sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.12-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.12-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tidybulk_1.2.1 ggrepel_0.9.1  ggplot2_3.3.3  magrittr_2.0.1 tibble_3.1.0  
## [6] tidyr_1.1.3    dplyr_1.0.5    knitr_1.31    
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.2.1             tidytext_0.3.0             
##   [3] plyr_1.8.6                  igraph_1.2.6               
##   [5] lazyeval_0.2.2              splines_4.0.4              
##   [7] BiocParallel_1.24.1         listenv_0.8.0              
##   [9] SnowballC_0.7.0             scattermore_0.7            
##  [11] GenomeInfoDb_1.26.6         sva_3.38.0                 
##  [13] digest_0.6.27               htmltools_0.5.1.1          
##  [15] fansi_0.4.2                 memoise_2.0.0              
##  [17] tensor_1.5                  cluster_2.1.1              
##  [19] ROCR_1.0-11                 limma_3.46.0               
##  [21] globals_0.14.0              readr_1.4.0                
##  [23] annotate_1.68.0             matrixStats_0.58.0         
##  [25] spatstat.sparse_2.0-0       colorspace_2.0-0           
##  [27] blob_1.2.1                  xfun_0.22                  
##  [29] crayon_1.4.1                RCurl_1.98-1.3             
##  [31] jsonlite_1.7.2              genefilter_1.72.1          
##  [33] spatstat.data_2.1-0         survival_3.2-10            
##  [35] zoo_1.8-9                   glue_1.4.2                 
##  [37] polyclip_1.10-0             gtable_0.3.0               
##  [39] zlibbioc_1.36.0             XVector_0.30.0             
##  [41] leiden_0.3.7                DelayedArray_0.16.3        
##  [43] future.apply_1.7.0          BiocGenerics_0.36.0        
##  [45] abind_1.4-5                 scales_1.1.1               
##  [47] DBI_1.1.1                   edgeR_3.32.1               
##  [49] miniUI_0.1.1.1              Rcpp_1.0.6                 
##  [51] widyr_0.1.3                 viridisLite_0.3.0          
##  [53] xtable_1.8-4                reticulate_1.18            
##  [55] spatstat.core_2.0-0         bit_4.0.4                  
##  [57] proxy_0.4-25                preprocessCore_1.52.1      
##  [59] stats4_4.0.4                htmlwidgets_1.5.3          
##  [61] httr_1.4.2                  RColorBrewer_1.1-2         
##  [63] ellipsis_0.3.1              Seurat_4.0.1               
##  [65] ica_1.0-2                   pkgconfig_2.0.3            
##  [67] XML_3.99-0.6                uwot_0.1.10                
##  [69] deldir_0.2-10               locfit_1.5-9.4             
##  [71] utf8_1.2.1                  tidyselect_1.1.0           
##  [73] rlang_0.4.10                reshape2_1.4.4             
##  [75] later_1.1.0.1               AnnotationDbi_1.52.0       
##  [77] munsell_0.5.0               tools_4.0.4                
##  [79] cachem_1.0.4                cli_2.4.0                  
##  [81] generics_0.1.0              RSQLite_2.2.5              
##  [83] broom_0.7.6                 ggridges_0.5.3             
##  [85] evaluate_0.14               stringr_1.4.0              
##  [87] fastmap_1.1.0               goftest_1.2-2              
##  [89] bit64_4.0.5                 fitdistrplus_1.1-3         
##  [91] purrr_0.3.4                 RANN_2.6.1                 
##  [93] pbapply_1.4-3               future_1.21.0              
##  [95] nlme_3.1-152                mime_0.10                  
##  [97] tokenizers_0.2.1            debugme_1.1.0              
##  [99] compiler_4.0.4              rstudioapi_0.13            
## [101] plotly_4.9.3                png_0.1-7                  
## [103] e1071_1.7-6                 spatstat.utils_2.1-0       
## [105] stringi_1.5.3               ps_1.6.0                   
## [107] lattice_0.20-41             Matrix_1.3-2               
## [109] vctrs_0.3.7                 pillar_1.5.1               
## [111] lifecycle_1.0.0             spatstat.geom_2.0-1        
## [113] lmtest_0.9-38               RcppAnnoy_0.0.18           
## [115] data.table_1.14.0           cowplot_1.1.1              
## [117] bitops_1.0-6                irlba_2.3.3                
## [119] httpuv_1.5.5                patchwork_1.1.1            
## [121] GenomicRanges_1.42.0        R6_2.5.0                   
## [123] promises_1.2.0.1            KernSmooth_2.23-18         
## [125] gridExtra_2.3               janeaustenr_0.1.5          
## [127] IRanges_2.24.1              parallelly_1.24.0          
## [129] codetools_0.2-18            MASS_7.3-53.1              
## [131] assertthat_0.2.1            SummarizedExperiment_1.20.0
## [133] withr_2.4.1                 SeuratObject_4.0.0         
## [135] sctransform_0.3.2           S4Vectors_0.28.1           
## [137] GenomeInfoDbData_1.2.4      mgcv_1.8-34                
## [139] parallel_4.0.4              hms_1.0.0                  
## [141] rpart_4.1-15                grid_4.0.4                 
## [143] class_7.3-18                MatrixGenerics_1.2.1       
## [145] Rtsne_0.15                  Biobase_2.50.0             
## [147] shiny_1.6.0