1 Introduction

This vignette illustrates existing and Bioconductor infrastructure for the visualisation of mass spectrometry and proteomics data. The code details the visualisations presented in

Gatto L, Breckels LM, Naake T, Gibb S. Visualisation of proteomics data using R and Bioconductor. Proteomics. 2015 Feb 18. doi: 10.1002/pmic.201400392. PubMed PMID: 25690415.

NB: I you are interested in R packages for mass spectrometry-based proteomics and metabolomics, see also the R for Mass Spectrometry initiative packages and the tutorial book

1.1 References

1.2 Relevant packages

There are currently 176 Proteomics and 137 MassSpectrometry packages in Bioconductor version 3.20. Other non-Bioconductor packages are described in the r Biocexptpkg("RforProteomics") vignette (open it in R with RforProteomics() or read it online.)

2 Ascombe’s quartet

x1 x2 x3 x4 y1 y2 y3 y4
10 10 10 8 8.04 9.14 7.46 6.58
8 8 8 8 6.95 8.14 6.77 5.76
13 13 13 8 7.58 8.74 12.74 7.71
9 9 9 8 8.81 8.77 7.11 8.84
11 11 11 8 8.33 9.26 7.81 8.47
14 14 14 8 9.96 8.10 8.84 7.04
6 6 6 8 7.24 6.13 6.08 5.25
4 4 4 19 4.26 3.10 5.39 12.50
12 12 12 8 10.84 9.13 8.15 5.56
7 7 7 8 4.82 7.26 6.42 7.91
5 5 5 8 5.68 4.74 5.73 6.89
tab <- matrix(NA, 5, 4)
colnames(tab) <- 1:4
rownames(tab) <- c("var(x)", "mean(x)",
                   "var(y)", "mean(y)",
                   "cor(x,y)")

for (i in 1:4)
    tab[, i] <- c(var(anscombe[, i]),
                  mean(anscombe[, i]),
                  var(anscombe[, i+4]),
                  mean(anscombe[, i+4]),
                  cor(anscombe[, i], anscombe[, i+4]))
1 2 3 4
var(x) 11.0000000 11.0000000 11.0000000 11.0000000
mean(x) 9.0000000 9.0000000 9.0000000 9.0000000
var(y) 4.1272691 4.1276291 4.1226200 4.1232491
mean(y) 7.5009091 7.5009091 7.5000000 7.5009091
cor(x,y) 0.8164205 0.8162365 0.8162867 0.8165214

While the residuals of the linear regression clearly indicate fundamental differences in these data, the most simple and straightforward approach is visualisation to highlight the fundamental differences in the datasets.

ff <- y ~ x

mods <- setNames(as.list(1:4), paste0("lm", 1:4))

par(mfrow = c(2, 2), mar = c(4, 4, 1, 1))
for (i in 1:4) {
    ff[2:3] <- lapply(paste0(c("y","x"), i), as.name)
    plot(ff, data = anscombe, pch = 19, xlim = c(3, 19), ylim = c(3, 13))
    mods[[i]] <- lm(ff, data = anscombe)
    abline(mods[[i]])
}

lm1 lm2 lm3 lm4
0.0390000 1.1390909 -0.5397273 -0.421
-0.0508182 1.1390909 -0.2302727 -1.241
-1.9212727 -0.7609091 3.2410909 0.709
1.3090909 1.2690909 -0.3900000 1.839
-0.1710909 0.7590909 -0.6894545 1.469
-0.0413636 -1.9009091 -1.1586364 0.039
1.2393636 0.1290909 0.0791818 -1.751
-0.7404545 -1.9009091 0.3886364 0.000
1.8388182 0.1290909 -0.8491818 -1.441
-1.6807273 0.7590909 -0.0805455 0.909
0.1794545 -0.7609091 0.2289091 -0.111

3 The MA plot example

The following code chunk connects to the PXD000001 data set on the ProteomeXchange repository and fetches the mzTab file. After missing values filtering, we extract relevant data (log2 fold-changes and log10 mean expression intensities) into data.frames.

library("rpx")
px1 <- PXDataset("PXD000001")
## Loading PXD000001 from cache.
mztab <- pxget(px1, "F063721.dat-mztab.txt")
## Loading F063721.dat-mztab.txt from cache.
library("MSnbase")
## here, we need to specify the (old) mzTab version 0.9
qnt <- readMzTabData(mztab, what = "PEP", version = "0.9")
sampleNames(qnt) <- reporterNames(TMT6)
qnt <- filterNA(qnt)
## may be combineFeatuers

spikes <- c("P02769", "P00924", "P62894", "P00489")
protclasses <- as.character(fData(qnt)$accession)
protclasses[!protclasses %in% spikes] <- "Background"


madata42 <- data.frame(A = rowMeans(log(exprs(qnt[, c(4, 2)]), 10)),
                       M = log(exprs(qnt)[, 4], 2) - log(exprs(qnt)[, 2], 2),
                       data = rep("4vs2", nrow(qnt)),
                       protein = fData(qnt)$accession,
                       class = factor(protclasses))

madata62 <- data.frame(A = rowMeans(log(exprs(qnt[, c(6, 2)]), 10)),
                       M = log(exprs(qnt)[, 6], 2) - log(exprs(qnt)[, 2], 2),
                       data = rep("6vs2", nrow(qnt)),
                       protein = fData(qnt)$accession,
                       class = factor(protclasses))


madata <- rbind(madata42, madata62)

3.1 The traditional plotting system

par(mfrow = c(1, 2))
plot(M ~ A, data = madata42, main = "4vs2",
     xlab = "A", ylab = "M", col = madata62$class)
plot(M ~ A, data = madata62, main = "6vs2",
     xlab = "A", ylab = "M", col = madata62$class)

3.2 lattice

library("lattice")
latma <- xyplot(M ~ A | data, data = madata,
                groups = madata$class,
                auto.key = TRUE)
print(latma)

3.3 ggplot2

library("ggplot2")
ggma <- ggplot(aes(x = A, y = M, colour = class), data = madata,
               colour = class) +
    geom_point() +
    facet_grid(. ~ data)
print(ggma)

3.4 Customization

library("RColorBrewer")
bcols <- brewer.pal(4, "Set1")
cls <- c("Background" = "#12121230",
         "P02769" = bcols[1],
         "P00924" = bcols[2],
         "P62894" = bcols[3],
         "P00489" = bcols[4])
ggma2 <- ggplot(aes(x = A, y = M, colour = class),
                data = madata) + geom_point(shape = 19) +
    facet_grid(. ~ data) + scale_colour_manual(values = cls) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))
print(ggma2)

3.5 The MAplot method for MSnSet instances

MAplot(qnt, cex = .8)

3.6 An interactive shiny app for MA plots

This (now outdated and deprecated) app is based on Mike Love’s shinyMA application, adapted for a proteomics data. A screen shot is displayed below.

See the excellent shiny web page for tutorials and the Mastering Shiny book for details on shiny.

3.7 Volcano plots

Below, using the msmsTest package, we load a example MSnSet data with spectral couting data (from the r Biocpkg("msmsEDA") package) and run a statistical test to obtain (adjusted) p-values and fold-changes.

library("msmsEDA")
library("msmsTests")
data(msms.dataset)
## Pre-process expression matrix
e <- pp.msms.data(msms.dataset)
## Models and normalizing condition
null.f <- "y~batch"
alt.f <- "y~treat+batch"
div <- apply(exprs(e), 2, sum)
## Test
res <- msms.glm.qlll(e, alt.f, null.f, div = div)
lst <- test.results(res, e, pData(e)$treat, "U600", "U200 ", div,
                    alpha = 0.05, minSpC = 2, minLFC = log2(1.8),
                    method = "BH")

Here, we produce the volcano plot by hand, with the plot function. In the second plot, we limit the x axis limits and add grid lines.

plot(lst$tres$LogFC, -log10(lst$tres$p.value))

plot(lst$tres$LogFC, -log10(lst$tres$p.value),
     xlim = c(-3, 3))
grid()

Below, we use the res.volcanoplot function from the r Biocpkg("msmsTests") package. This functions uses the sample annotation stored with the quantitative data in the MSnSet object to colour the samples according to their phenotypes.

## Plot
res.volcanoplot(lst$tres,
                max.pval = 0.05,
                min.LFC = 1,
                maxx = 3,
                maxy = NULL,
                ylbls = 4)

3.8 A PCA plot

Using the counts.pca function from the msmsEDA package:

library("msmsEDA")
data(msms.dataset)
msnset <- pp.msms.data(msms.dataset)
lst <- counts.pca(msnset, wait = FALSE)

It is also possible to generate the PCA data using the prcomp. Below, we extract the coordinates of PC1 and PC2 from the counts.pca result and plot them using the plot function.

pcadata <- lst$pca$x[, 1:2]
head(pcadata)
##                  PC1       PC2
## U2.2502.1 -120.26080 -53.55270
## U2.2502.2  -99.90618 -53.89979
## U2.2502.3 -127.35928 -49.29906
## U2.2502.4 -166.04611 -39.27557
## U6.2502.1 -127.18423  37.11614
## U6.2502.2 -117.97016  47.03702
plot(pcadata[, 1], pcadata[, 2],
     xlab = "PCA1", ylab = "PCA2")
grid()

4 Plotting with R

kable(plotfuns)
plot type traditional lattice ggplot2
scatterplots plot xyplot geom_point
histograms hist histgram geom_histogram
density plots plot(density()) densityplot geom_density
boxplots boxplot bwplot geom_boxplot
violin plots vioplot::vioplot bwplot(…, panel = panel.violin) geom_violin
line plots plot, matplot xyploy, parallelplot geom_line
bar plots barplot barchart geom_bar
pie charts pie geom_bar with polar coordinates
dot plots dotchart dotplot geom_point
stip plots stripchart stripplot goem_point
dendrogramms plot(hclust()) latticeExtra package ggdendro package
heatmaps image, heatmap levelplot geom_tile

Below, we are going to use a data from the pRolocdata to illustrate the plotting functions.

library("pRolocdata")
data(tan2009r1)

4.1 Scatter plots

See the MA and volcano plot examples.

The default plot type is p, for points. Other important types are l for lines and h for histogram (see below).

4.2 Historams and density plots

We extract the (normalised) intensities of the first sample

x <- exprs(tan2009r1)[, 1]

and plot the distribution with a histogram and a density plot next to each other on the same figure (using the mfrow par plotting paramter)

par(mfrow = c(1, 2))
hist(x)
plot(density(x))

4.3 Box plots and violin plots

we first extract the 888 proteins by r ncol(tan2009r1) samples data matrix and plot the sample distributions next to each other using boxplot and beanplot (from the beanplot package).

library("beanplot")
x <- exprs(tan2009r1)
par(mfrow = c(2, 1))
boxplot(x)
beanplot(x[, 1], x[, 2], x[, 3], x[, 4], log = "")

4.4 Line plots

below, we produce line plots that describe the protein quantitative profiles for two sets of proteins, namely er and mitochondrial proteins using matplot.

we need to transpose the matrix (with t) and set the type to both (b), to display points and lines, the colours to red and steel blue, the point characters to 1 (an empty point) and the line type to 1 (a solid line).

er <- fData(tan2009r1)$markers == "ER"
mt <- fData(tan2009r1)$markers == "mitochondrion"

par(mfrow = c(2, 1))
matplot(t(x[er, ]), type = "b", col = "red", pch = 1, lty = 1)
matplot(t(x[mt, ]), type = "b", col = "steelblue", pch = 1, lty = 1)

In the last section, about spatial proteomics, we use the specialised plotDist function from the pRoloc package to generate such figures.

4.5 Bar and dot charts

To illustrate bar and dot charts, we cound the number of proteins in the respective class using table.

x <- table(fData(tan2009r1)$markers)
x
## 
##  Cytoskeleton            ER         Golgi      Lysosome       Nucleus 
##             7            28            13             8            21 
##            PM    Peroxisome    Proteasome  Ribosome 40S  Ribosome 60S 
##            34             4            15            20            32 
## mitochondrion       unknown 
##            29           677
par(mfrow = c(1, 2))
barplot(x)
dotchart(as.numeric(x))

4.6 Heatmaps

The easiest to produce a complete heatmap is with the heatmap function:

heatmap(exprs(tan2009r1))

To produce the a heatmap without the dendrograms, one can use the image function on a matrix or the specialised version for MSnSet objects from the MSnbase package.

par(mfrow = c(1, 2))
x <- matrix(1:9, ncol = 3)
image(x)
image(tan2009r1)

See also gplots’s heatmap.2 function and the Heatplus Bioconductor package for more advanced heatmaps and the corrplot package for correlation matrices.

4.7 Dendrograms

The easiest way to produce and plot a dendrogram is:

d <- dist(t(exprs(tan2009r1))) ## distance between samples
hc <- hclust(d) ## hierarchical clustering
plot(hc) ## visualisation

See also dendextend and this post to illustrate latticeExtra and ggdendro.

4.8 Venn diagrams

5 Visualising mass spectrometry data

5.1 Direct access to the raw data

library("mzR")
mzf <- pxget(px1,
             "TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML")
## Loading TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01-20141210.mzML from cache.
ms <- openMSfile(mzf)

hd <- header(ms)
ms1 <- which(hd$msLevel == 1)

rtsel <- hd$retentionTime[ms1] / 60 > 30 & hd$retentionTime[ms1] / 60 < 35
lout <- matrix(NA, ncol = 10, nrow = 8)
lout[1:2, ] <- 1
for (ii in 3:4)
    lout[ii, ] <- c(2, 2, 2, 2, 2, 2, 3, 3, 3, 3)
lout[5, ] <- rep(4:8, each = 2)
lout[6, ] <- rep(4:8, each = 2)
lout[7, ] <- rep(9:13, each = 2)
lout[8, ] <- rep(9:13, each = 2)
i <- ms1[which(rtsel)][1]
j <- ms1[which(rtsel)][2]
ms2 <- (i+1):(j-1)
layout(lout)
par(mar=c(4,2,1,1))
plot(chromatogram(ms)[[1]], type = "l")
abline(v = hd[i, "retentionTime"], col = "red")
par(mar = c(3, 2, 1, 0))
plot(peaks(ms, i), type = "l", xlim = c(400, 1000))
legend("topright", bty = "n",
       legend = paste0(
           "Acquisition ", hd[i, "acquisitionNum"],  "\n",
           "Retention time ", formatRt(hd[i, "retentionTime"])))
abline(h = 0)
abline(v = hd[ms2, "precursorMZ"],
       col = c("#FF000080",
           rep("#12121280", 9)))
par(mar = c(3, 0.5, 1, 1))
plot(peaks(ms, i), type = "l", xlim = c(521, 522.5), yaxt = "n")
abline(h = 0)
abline(v = hd[ms2, "precursorMZ"], col = "#FF000080")
par(mar = c(2, 2, 0, 1))
for (ii in ms2) {
    p <- peaks(ms, ii)
    plot(p, xlab = "", ylab = "", type = "h", cex.axis = .6)
    legend("topright",
           legend = paste0("Prec M/Z\n", round(hd[ii, "precursorMZ"], 2)),
           bty = "n", cex = .8)
}
Accesing and plotting MS data.

Figure 1: Accesing and plotting MS data

M2 <- MSmap(ms, i:j, 100, 1000, 1, hd)
plot3D(M2)

5.1.1 MS barcoding

par(mar=c(4,1,1,1))
image(t(matrix(hd$msLevel, 1, nrow(hd))),
      xlab="Retention time",
      xaxt="n", yaxt="n", col=c("black","steelblue"))
k <- round(range(hd$retentionTime) / 60)
nk <- 5
axis(side=1, at=seq(0,1,1/nk), labels=seq(k[1],k[2],k[2]/nk))

5.1.2 Animation

The following animation scrolls over 5 minutes of retention time for a MZ range between 521 and 523.

library("animation")
an1 <- function() {
    for (i in seq(0, 5, 0.2)) {
        rtsel <- hd$retentionTime[ms1] / 60 > (30 + i) &
            hd$retentionTime[ms1] / 60 < (35 + i)
        M <- MSmap(ms, ms1[rtsel], 521, 523, .005, hd)
        M@map[msMap(M) == 0] <- NA
        print(plot3D(M, rgl = FALSE))
    }
}

saveGIF(an1(), movie.name = "msanim1.gif")
knitr::include_graphics("./figures/msanim1.gif")