Contents

1 Examples and use cases

This vignette demonstrates several examples and use cases for the datasets in the HDCytoData package.

2 Dimension reduction

Using the clustering datasets, we can generate dimension reduction plots with colors indicating the ground truth cell population labels. This provides a visual representation of the cell population structure in these datasets, which is useful during exploratory data analysis and for representing the output of clustering or other downstream analysis algorithms.

Below, we compare three different dimension reduction algorithms (principal component analysis [PCA], t-distributed stochastic neighbor embedding [tSNE], and uniform manifold approximation and projection [UMAP]), for one of the datasets (Levine_32dim). This dataset contains ground truth cell population labels for 14 immune cell populations.

suppressPackageStartupMessages(library(HDCytoData))
suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(Rtsne))
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
# ---------
# Load data
# ---------

d_SE <- Levine_32dim_SE()
## snapshotDate(): 2020-10-02
## see ?HDCytoData and browseVignettes('HDCytoData') for documentation
## loading from cache
# -------------
# Preprocessing
# -------------

# select 'cell type' marker columns for defining clusters
d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"])

# extract cell population labels
population <- rowData(d_SE)$population_id

dim(d_sub)
## [1] 265627     32
stopifnot(nrow(d_sub) == length(population))

# transform data using asinh with cofactor 5
cofactor <- 5
d_sub <- asinh(d_sub / cofactor)

summary(d_sub)
##      CD45RA             CD133               CD19               CD22         
##  Min.   :-0.05731   Min.   :-0.05808   Min.   :-0.05809   Min.   :-0.05734  
##  1st Qu.: 0.20463   1st Qu.:-0.02294   1st Qu.:-0.01884   1st Qu.:-0.02069  
##  Median : 0.54939   Median : 0.02535   Median : 0.07521   Median : 0.05879  
##  Mean   : 0.68813   Mean   : 0.14596   Mean   : 0.50930   Mean   : 0.39732  
##  3rd Qu.: 1.03120   3rd Qu.: 0.22430   3rd Qu.: 0.54839   3rd Qu.: 0.38648  
##  Max.   : 6.69120   Max.   : 5.52750   Max.   : 4.99008   Max.   : 5.16048  
##      CD11b                CD4                CD8                CD34         
##  Min.   :-0.058236   Min.   :-0.05775   Min.   :-0.05800   Min.   :-0.05801  
##  1st Qu.:-0.000294   1st Qu.:-0.01259   1st Qu.:-0.01732   1st Qu.:-0.01117  
##  Median : 0.257923   Median : 0.13122   Median : 0.07363   Median : 0.11071  
##  Mean   : 0.710319   Mean   : 0.36760   Mean   : 0.56522   Mean   : 0.33989  
##  3rd Qu.: 0.923517   3rd Qu.: 0.57812   3rd Qu.: 0.48642   3rd Qu.: 0.39281  
##  Max.   : 5.260789   Max.   : 6.58176   Max.   : 4.69369   Max.   : 5.14800  
##       Flt3                CD20              CXCR4             CD235ab        
##  Min.   :-0.057884   Min.   :-0.05813   Min.   :-0.05704   Min.   :-0.05761  
##  1st Qu.:-0.007793   1st Qu.:-0.02207   1st Qu.: 0.25290   1st Qu.: 0.23100  
##  Median : 0.110317   Median : 0.03382   Median : 0.66539   Median : 0.54043  
##  Mean   : 0.229768   Mean   : 0.38441   Mean   : 0.79247   Mean   : 0.63189  
##  3rd Qu.: 0.336117   3rd Qu.: 0.32551   3rd Qu.: 1.20168   3rd Qu.: 0.92358  
##  Max.   : 7.117323   Max.   : 6.05141   Max.   : 5.69667   Max.   : 6.64670  
##       CD45           CD123              CD321               CD14          
##  Min.   :2.040   Min.   :-0.05800   Min.   :-0.05355   Min.   :-0.057954  
##  1st Qu.:5.116   1st Qu.:-0.01162   1st Qu.: 1.32346   1st Qu.:-0.026326  
##  Median :5.645   Median : 0.09602   Median : 1.90479   Median :-0.005379  
##  Mean   :5.408   Mean   : 0.37241   Mean   : 1.93542   Mean   : 0.077030  
##  3rd Qu.:5.939   3rd Qu.: 0.41310   3rd Qu.: 2.51781   3rd Qu.: 0.089789  
##  Max.   :7.238   Max.   : 6.64063   Max.   : 6.86739   Max.   : 5.006121  
##       CD33               CD47              CD11c                CD7          
##  Min.   :-0.05808   Min.   :-0.05509   Min.   :-0.058053   Min.   :-0.05816  
##  1st Qu.:-0.01813   1st Qu.: 2.08788   1st Qu.:-0.002711   1st Qu.:-0.01567  
##  Median : 0.06107   Median : 2.71442   Median : 0.212063   Median : 0.13002  
##  Mean   : 0.30792   Mean   : 2.65608   Mean   : 0.703504   Mean   : 0.81384  
##  3rd Qu.: 0.34147   3rd Qu.: 3.27654   3rd Qu.: 0.861448   3rd Qu.: 1.37083  
##  Max.   : 5.61247   Max.   : 6.40249   Max.   : 6.520939   Max.   : 6.31922  
##       CD15               CD16               CD44              CD38         
##  Min.   :-0.05808   Min.   :-0.05778   Min.   :0.02606   Min.   :-0.05719  
##  1st Qu.:-0.01502   1st Qu.:-0.02255   1st Qu.:3.12712   1st Qu.: 0.40198  
##  Median : 0.09355   Median : 0.01424   Median :3.87967   Median : 1.02032  
##  Mean   : 0.23136   Mean   : 0.16123   Mean   :3.76018   Mean   : 1.47781  
##  3rd Qu.: 0.38331   3rd Qu.: 0.16077   3rd Qu.:4.47392   3rd Qu.: 2.19146  
##  Max.   : 1.53415   Max.   : 5.33831   Max.   :7.40456   Max.   : 7.29308  
##       CD13               CD3                CD61              CD117         
##  Min.   :-0.05773   Min.   :-0.05824   Min.   :-0.05764   Min.   :-0.05767  
##  1st Qu.: 0.02110   1st Qu.: 0.08495   1st Qu.:-0.01285   1st Qu.:-0.02396  
##  Median : 0.18706   Median : 0.60376   Median : 0.09569   Median :-0.00041  
##  Mean   : 0.36856   Mean   : 2.16576   Mean   : 0.34446   Mean   : 0.13120  
##  3rd Qu.: 0.53550   3rd Qu.: 4.66522   3rd Qu.: 0.41579   3rd Qu.: 0.15474  
##  Max.   : 6.98119   Max.   : 6.74836   Max.   : 7.74850   Max.   : 5.50213  
##      CD49d              HLA-DR              CD64               CD41         
##  Min.   :-0.05806   Min.   :-0.05797   Min.   :-0.05820   Min.   :-0.05824  
##  1st Qu.: 0.28301   1st Qu.: 0.05771   1st Qu.:-0.01058   1st Qu.:-0.02017  
##  Median : 0.67721   Median : 0.61133   Median : 0.12249   Median : 0.05223  
##  Mean   : 0.79494   Mean   : 1.52181   Mean   : 0.55151   Mean   : 0.26175  
##  3rd Qu.: 1.19079   3rd Qu.: 2.88824   3rd Qu.: 0.60413   3rd Qu.: 0.30559  
##  Max.   : 5.15344   Max.   : 7.05251   Max.   : 4.51784   Max.   : 7.71829
# subsample cells for faster runtimes in vignette
n <- 2000

set.seed(123)
ix <- sample(seq_len(nrow(d_sub)), n)

d_sub <- d_sub[ix, ]
population <- population[ix]

dim(d_sub)
## [1] 2000   32
stopifnot(nrow(d_sub) == length(population))

# remove any near-duplicate rows (required by Rtsne)
dups <- duplicated(d_sub)
d_sub <- d_sub[!dups, ]
population <- population[!dups]

dim(d_sub)
## [1] 1998   32
stopifnot(nrow(d_sub) == length(population))
# ------------------------
# Dimension reduction: PCA
# ------------------------

n_dims <- 2

# run PCA
# (note: no scaling, since asinh-transformed dimensions are already comparable)
out_PCA <- prcomp(d_sub, center = TRUE, scale. = FALSE)
dims_PCA <- out_PCA$x[, seq_len(n_dims)]
colnames(dims_PCA) <- c("PC_1", "PC_2")
head(dims_PCA)
##           PC_1       PC_2
## [1,]  1.450702  3.1573053
## [2,]  2.453109 -0.9381139
## [3,] -2.705226  0.7090551
## [4,]  2.718284 -2.2801305
## [5,] -2.714230 -0.1954170
## [6,] -3.003650 -0.1938087
stopifnot(nrow(dims_PCA) == length(population))

colnames(dims_PCA) <- c("dimension_x", "dimension_y")
dims_PCA <- cbind(as.data.frame(dims_PCA), population, type = "PCA")

head(dims_PCA)
##   dimension_x dimension_y population type
## 1    1.450702   3.1573053 unassigned  PCA
## 2    2.453109  -0.9381139 unassigned  PCA
## 3   -2.705226   0.7090551 unassigned  PCA
## 4    2.718284  -2.2801305 unassigned  PCA
## 5   -2.714230  -0.1954170 unassigned  PCA
## 6   -3.003650  -0.1938087 unassigned  PCA
str(dims_PCA)
## 'data.frame':    1998 obs. of  4 variables:
##  $ dimension_x: num  1.45 2.45 -2.71 2.72 -2.71 ...
##  $ dimension_y: num  3.157 -0.938 0.709 -2.28 -0.195 ...
##  $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
##  $ type       : chr  "PCA" "PCA" "PCA" "PCA" ...
# generate plot
d_plot <- dims_PCA
str(d_plot)
## 'data.frame':    1998 obs. of  4 variables:
##  $ dimension_x: num  1.45 2.45 -2.71 2.72 -2.71 ...
##  $ dimension_y: num  3.157 -0.938 0.709 -2.28 -0.195 ...
##  $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
##  $ type       : chr  "PCA" "PCA" "PCA" "PCA" ...
colors <- c(rainbow(14), "gray75")

ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + 
  facet_wrap(~ type, scales = "free") + 
  geom_point(size = 0.7, alpha = 0.5) + 
  scale_color_manual(values = colors) + 
  labs(x = "dimension x", y = "dimension y") + 
  theme_bw() + 
  theme(aspect.ratio = 1, 
        legend.key.height = unit(4, "mm"))

# -------------------------
# Dimension reduction: tSNE
# -------------------------

# run Rtsne
set.seed(123)
out_Rtsne <- Rtsne(as.matrix(d_sub), dims = n_dims)
dims_Rtsne <- out_Rtsne$Y
colnames(dims_Rtsne) <- c("tSNE_1", "tSNE_2")
head(dims_Rtsne)
##          tSNE_1     tSNE_2
## [1,]  21.317627  -9.102072
## [2,]  -6.457918  21.639648
## [3,]  -4.640114 -21.309045
## [4,]  -5.916015  26.265756
## [5,] -18.297775 -15.155949
## [6,]  -5.113306 -13.962952
stopifnot(nrow(dims_Rtsne) == length(population))

colnames(dims_Rtsne) <- c("dimension_x", "dimension_y")
dims_Rtsne <- cbind(as.data.frame(dims_Rtsne), population, type = "tSNE")

head(dims_Rtsne)
##   dimension_x dimension_y population type
## 1   21.317627   -9.102072 unassigned tSNE
## 2   -6.457918   21.639648 unassigned tSNE
## 3   -4.640114  -21.309045 unassigned tSNE
## 4   -5.916015   26.265756 unassigned tSNE
## 5  -18.297775  -15.155949 unassigned tSNE
## 6   -5.113306  -13.962952 unassigned tSNE
str(dims_Rtsne)
## 'data.frame':    1998 obs. of  4 variables:
##  $ dimension_x: num  21.32 -6.46 -4.64 -5.92 -18.3 ...
##  $ dimension_y: num  -9.1 21.6 -21.3 26.3 -15.2 ...
##  $ population : Factor w/ 15 levels "Basophils","CD16-_NK_cells",..: 15 15 15 15 15 15 15 9 10 10 ...
##  $ type       : chr  "tSNE" "tSNE" "tSNE" "tSNE" ...
# generate plot
d_plot <- dims_Rtsne

ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + 
  facet_wrap(~ type, scales = "free") + 
  geom_point(size = 0.7, alpha = 0.5) + 
  scale_color_manual(values = colors) + 
  labs(x = "dimension x", y = "dimension y") + 
  theme_bw() + 
  theme(aspect.ratio = 1, 
        legend.key.height = unit(4, "mm"))