The following content was used to compile the Supplementary Note 1 in Egloff et al. (2018) (under review NMETH-A35040) to demonstrate the robustness of protein detection via flycodes.
library(lattice)
library(knitr)
library(parallel)
library(NestLink)
cv <- 1 - 1:7 / 10
t <- trellis.par.get("strip.background")
t$col <- (rgb(cv,cv,cv))
trellis.par.set("strip.background", t)
n.simulation <- 10
# filename <- system.file("extdata/PGexport2_normalizedAgainstSBstandards_Peptides.csv",
# package = "NestLink")
# library(ExperimentHub)
# eh <- ExperimentHub()
# filename <- query(eh, c("NestLink", "PGexport2_normalizedAgainstSBstandards_Peptides.csv"))[[1]]
filename <- getExperimentHubFilename("PGexport2_normalizedAgainstSBstandards_Peptides.csv")
P <- read.csv(filename,
header = TRUE, sep=';')
dim(P)
## [1] 721 27
remove modifications
P <- P[P$Modifications == '', ]
dim(P)
## [1] 697 27
select rows
P <- P[,c('Accession', 'Sequence', "X20170919_05_62465_nl5idx1.3_6titratecoli",
"X20170919_16_62465_nl5idx1.3_6titratecoli",
"X20170919_09_62466_nl5idx1.3_7titratesmeg",
"X20170919_14_62466_nl5idx1.3_7titratesmeg")]
dim(P)
## [1] 697 6
rename column names
names(P)<-c('Accession','Sequence','coli1', 'coli2', 'smeg1', 'smeg2')
dim(P)
## [1] 697 6
remove all rows with invalid identidfier
P<- P[grep("^P[0-9][A-Z][0-9]", P$Accession), ]
P<-droplevels(P)
add flycode set annotation
P$ConcGr <- NA
P$ConcGr[P$Accession %in% c('P1A4', 'P1B4', 'P1C4', 'P1D4', 'P1E4', 'P1F4')] <- 92
P$ConcGr[P$Accession %in% c('P1A5', 'P1B5', 'P1C5', 'P1D5', 'P1G4', 'P1H4')] <- 295
P$ConcGr[P$Accession %in% c('P1A6', 'P1B6', 'P1E5', 'P1F5', 'P1G5', 'P1H5')] <- 943
P$ConcGr[P$Accession %in% c('P1C6', 'P1D6', 'P1E6', 'P1F6', 'P1G6', 'P1H6')] <- 3017
table(P$ConcGr)
##
## 92 295 943 3017
## 82 122 135 165
Pabs <- P
table(P$Accession)
##
## P1A4 P1A5 P1A6 P1B4 P1B5 P1B6 P1C4 P1C5 P1C6 P1D4 P1D5 P1D6 P1E4 P1E5 P1E6 P1F4
## 11 22 24 15 22 23 16 19 26 15 16 29 13 23 30 12
## P1F5 P1F6 P1G4 P1G5 P1G6 P1H4 P1H5 P1H6
## 19 28 22 24 24 21 22 28
P.summary <- aggregate(. ~ ConcGr * Accession, data=P[,c('Accession', 'coli1',
'coli2', 'smeg1', 'smeg2', 'ConcGr')], FUN=sum)
P.summary$nDetectedFlycodes <- aggregate(Sequence ~ ConcGr * Accession,
data=na.omit(P), FUN=length)[,3]
P.summary$nTotalFlycodes <- 30
P.summary$coverage <- round(100 * P.summary$nDetectedFlycodes / P.summary$nTotalFlycodes)
kable(P.summary[order(P.summary$ConcGr),], row.names = FALSE)
ConcGr | Accession | coli1 | coli2 | smeg1 | smeg2 | nDetectedFlycodes | nTotalFlycodes | coverage |
---|---|---|---|---|---|---|---|---|
92 | P1A4 | 3473996 | 3670152 | 3524234 | 3629077 | 11 | 30 | 37 |
92 | P1B4 | 3972002 | 3951215 | 3685668 | 3632472 | 15 | 30 | 50 |
92 | P1C4 | 5582053 | 5716917 | 5700899 | 5623891 | 16 | 30 | 53 |
92 | P1D4 | 5188468 | 5642192 | 5245252 | 5281381 | 15 | 30 | 50 |
92 | P1E4 | 4697745 | 4824760 | 5051017 | 5146804 | 13 | 30 | 43 |
92 | P1F4 | 3968239 | 4026336 | 4246879 | 4251951 | 12 | 30 | 40 |
295 | P1A5 | 23970332 | 24546550 | 26479936 | 25950084 | 22 | 30 | 73 |
295 | P1B5 | 16033017 | 16352277 | 17209165 | 17490691 | 22 | 30 | 73 |
295 | P1C5 | 24016078 | 24734329 | 22810916 | 22589773 | 19 | 30 | 63 |
295 | P1D5 | 17658145 | 17864382 | 17760463 | 17773297 | 16 | 30 | 53 |
295 | P1G4 | 16016015 | 16618336 | 18270569 | 19036154 | 22 | 30 | 73 |
295 | P1H4 | 19519544 | 20301903 | 20111340 | 20174474 | 21 | 30 | 70 |
943 | P1A6 | 65538576 | 67507722 | 66123186 | 66400185 | 24 | 30 | 80 |
943 | P1B6 | 55078431 | 58048226 | 50652047 | 50278919 | 23 | 30 | 77 |
943 | P1E5 | 60305499 | 62836186 | 62173846 | 61952054 | 23 | 30 | 77 |
943 | P1F5 | 46800670 | 49088967 | 47748401 | 47930977 | 19 | 30 | 63 |
943 | P1G5 | 54312219 | 55033298 | 51968903 | 51106582 | 24 | 30 | 80 |
943 | P1H5 | 59931716 | 61370897 | 55971370 | 56588298 | 22 | 30 | 73 |
3017 | P1C6 | 168463728 | 180586807 | 158342929 | 158748952 | 26 | 30 | 87 |
3017 | P1D6 | 159828074 | 163103139 | 140519542 | 138765621 | 29 | 30 | 97 |
3017 | P1E6 | 166626467 | 176697478 | 159850632 | 161744675 | 30 | 30 | 100 |
3017 | P1F6 | 191955804 | 203793127 | 181359940 | 182939796 | 28 | 30 | 93 |
3017 | P1G6 | 183248427 | 189386707 | 177588746 | 178178834 | 24 | 30 | 80 |
3017 | P1H6 | 181057205 | 185326019 | 161715001 | 152753819 | 28 | 30 | 93 |
# write.csv(P.summary, file='Figs/FigureS6b.csv')
The following function defines the computation and rendering of the by the
paris
function called panels.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
rv <-lapply(unique(P$ConcGr), function(q){
pairs((P[P$ConcGr == q ,c('coli1', 'coli2', 'smeg1', 'smeg2')]),
pch=16, col=rgb(0.5,0.5,0.5,alpha = 0.3),
lower.panel = panel.cor,
asp=1,
main=q)
})
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
FCfill2max
to conduct a fair random experiment we add dummy flycodes
to the input data.frame
.
FCfill2max <- function(P, n = max(table(P$Accession))){
for (i in unique(P$Accession)){
idx <- which(P$Accession == i)
# determine the number of missing rows for Accession i
ndiff <- n - length(idx)
if(length(idx) < n){
cand <- P[idx[1], ]
cand[,2 ] <- "xxx"
cand[,3:6 ] <- NA
for (j in 1:ndiff){
P <- rbind(P, cand)
}
}
}
P
}
P.mean <- apply(P[, c('coli1', 'coli2', 'smeg1', 'smeg2')],1, FUN=mean)
P.sd <- apply(P[, c('coli1', 'coli2', 'smeg1', 'smeg2')],1, FUN=sd)
boxplot(100*P.sd/P.mean ~ P$ConcGr,log='y', ylab='CV%')
P <- FCfill2max(P)
FlycodeAbsoluteStatistics
using coli1
columnFlycodeAbsoluteStatistics <- function(P){
PP <- aggregate(P$coli1 ~ P$Accession + P$ConcGr, FUN=sum)
names(PP) <- c('Accession', 'ConcGr', 'coli1_sum')
PPP <- aggregate(PP$coli1_sum ~ PP$ConcGr, FUN=mean)
P.mean <- aggregate(PP$coli1_sum ~ PP$ConcGr, FUN=mean)
P.sd <- aggregate(PP$coli1_sum ~ PP$ConcGr, FUN=sd)
P.cv <- aggregate(PP$coli1_sum ~ PP$ConcGr,
FUN = function(x){ 100 * sd(x) / mean(x) })
P.length <- max(aggregate(P$coli1 ~ P$Accession, FUN=length)[,2])
rv <- data.frame(ConcGr=P.sd[,1],
mean=P.mean[,2],
sd=P.sd[,2],
cv=round(P.cv[,2],2))
rv$nFCAccession <- P.length
rv
}
kable(FlycodeAbsoluteStatistics(P))
ConcGr | mean | sd | cv | nFCAccession |
---|---|---|---|---|
92 | 4480417 | 811894.8 | 18.12 | 30 |
295 | 19535522 | 3685706.9 | 18.87 | 30 |
943 | 56994519 | 6440051.4 | 11.30 | 30 |
3017 | 175196618 | 12124519.7 | 6.92 | 30 |
FCrandom
# TODO(cp); make it work for n = 0
FCrandom <- function(P, n = 1){
if(n == 0){
return (P)
}
for (i in unique(P$Accession)){
idx <- which(P$Accession == i)
stopifnot(length(idx) >= n)
smp <- sample(length(idx), n)
P <- P[-idx[smp],]
}
P$n <- n
P
}
set.seed(8)
S <- do.call('rbind', lapply(1:29, function(i){
FlycodeAbsoluteStatistics(FCrandom(P, i))
}))
xyplot(cv ~ nFCAccession | ConcGr,
data = S,
layout = c(4, 1),
strip = strip.custom(strip.names = TRUE, strip.levels = TRUE)
)
set.seed(1)
S.rep <- do.call('rbind',
lapply(1:n.simulation, function(s){
S <- do.call('rbind',
lapply(1:29, function(i){
FlycodeAbsoluteStatistics(FCrandom(P, i))
}))
}))
NestLink_absolute_flycode_simulation <- xyplot(cv ~ nFCAccession | ConcGr,
data = S.rep,
panel = function(x,y,...){
panel.abline(h=c(10, 50), col='red')
panel.xyplot(x, y, ...)
xy.median <- (aggregate(y, by=list(x), FUN=median, na.rm = TRUE))
xy.quantile <- aggregate(y, by=list(x), FUN=function(d){quantile(d, c(0.05, 0.5, 0.95), na.rm = TRUE)})
panel.points( xy.median[,1], xy.median[,2], pch=16, cex=0.5)
# print((xy.quantile[,2])[,3])
panel.points( xy.quantile[,1],(xy.quantile[,2])[,1], pch='-')
panel.points( xy.quantile[,1],(xy.quantile[,2])[,3], pch='-')
},
xlab= 'Number of flycodes',
ylab ='CV [%]',
strip = strip.custom(strip.names = TRUE, strip.levels = TRUE),
scales=list(x=list(at=c(1,5,10,15,20,25,30)),
y=list(at=c(0,10,50,100,150,200))),
ylim=c(0,175),
pch=16,
col=rgb(0.5, 0.5, 0.5, alpha = 0.01),
layout = c(4,1))
print(NestLink_absolute_flycode_simulation)
# png("NestLink_absolute_flycode_simulation.png", 1200,800)
# print(NestLink_absolute_flycode_simulation)
# dev.off()
P <- na.omit(P)
P <- P[P$coli1 >0,]
P$coli1 <- P$coli1 / sum(P$coli1)
P$coli2 <- P$coli2 / sum(P$coli2)
P$smeg1 <- P$smeg1 / sum(P$smeg1)
P$smeg2 <- P$smeg2 / sum(P$smeg2)
sanity check
sum(P$coli1)
## [1] 1
sum(P$coli2)
## [1] 1
sum(P$smeg1)
## [1] 1
sum(P$smeg2)
## [1] 1
P$ratios <- (0.5* (P$coli1 + P$coli2)) / (0.5 * (P$smeg1 + P$smeg2))
b <- boxplot(P$coli1 / P$coli2, P$coli1 / P$smeg1, P$coli1 / P$smeg2,P$coli2 / P$smeg1, P$coli2 / P$smeg2 , P$smeg1 / P$smeg2, P$ratios,
ylab='ratios', ylim = c(0,2 ))
axis(1, 1:6, c('coli[12]', 'coli1-smeg1', 'coli1-smeg2', 'coli2-smeg1', 'coli2- smeg2','smeg[12]'))
op <- par(mfrow = c(1, 4))
boxplot(P$coli1 ~ P$ConcGr)
boxplot(P$coli2 ~ P$ConcGr)
boxplot(P$smeg1 ~ P$ConcGr)
boxplot(P$smeg2 ~ P$ConcGr)
op <- par(mfrow=c(1,1), mar=c(5,5,5,2) )
b <- boxplot(df<-cbind(P$coli1/P$coli2, P$coli1/P$smeg1, P$coli1/P$smeg2, P$coli2/P$smeg1, P$coli2/P$smeg2, P$smeg1/P$smeg2, P$ratios),
log='y',
ylab='normalized ratios',
#ylim = c(0, 2),
axes=FALSE,
main=paste("ConcGr = all"))
axis(1, 1:7, c('coli[12]', 'coli1-smeg1', 'coli1-smeg2', 'coli2-smeg1', 'coli2- smeg2','smeg[12]', 'ratio'))
abline(h=1, col='red')
box()
axis(2)
#axis(3, 1:7, b$n)
outliers.idx <- sapply(1:length(b$group),
function(i){
q <- df[, b$group[i]] == b$out[i];
text(b$group[i], b$out[i], P[q, 2], pos=4, cex=0.4);
text(b$group[i], b$out[i], P[q, 1], pos=2, cex=0.4);
which(q)})