Contents

1 Getting started

library(lattice)
library(knitr)
library(parallel)
library(NestLink)

cv <- 1 - 1:7 / 10
t <- trellis.par.get("strip.background")
t$col <- (rgb(cv,cv,cv))
trellis.par.set("strip.background", t)
n.simulation <- 10

2 Load data

# filename <- system.file("extdata/PGexport2_normalizedAgainstSBstandards_Peptides.csv",
#                        package = "NestLink")
# library(ExperimentHub)
# eh <- ExperimentHub()
# filename <- query(eh, c("NestLink", "PGexport2_normalizedAgainstSBstandards_Peptides.csv"))[[1]]

filename <- getExperimentHubFilename("PGexport2_normalizedAgainstSBstandards_Peptides.csv")
P <- read.csv(filename,
              header = TRUE, sep=';')
dim(P)
## [1] 721  27

2.1 Clean

remove modifications

P <- P[P$Modifications == '', ]
dim(P)
## [1] 697  27

select rows

P <- P[,c('Accession', 'Sequence', "X20170919_05_62465_nl5idx1.3_6titratecoli",
          "X20170919_16_62465_nl5idx1.3_6titratecoli", 
          "X20170919_09_62466_nl5idx1.3_7titratesmeg",
          "X20170919_14_62466_nl5idx1.3_7titratesmeg")]
dim(P)
## [1] 697   6

rename column names

names(P)<-c('Accession','Sequence','coli1', 'coli2', 'smeg1', 'smeg2')
dim(P)
## [1] 697   6

remove all rows with invalid identidfier

P<- P[grep("^P[0-9][A-Z][0-9]", P$Accession), ] 

P<-droplevels(P)

add flycode set annotation

P$ConcGr <- NA

P$ConcGr[P$Accession %in% c('P1A4', 'P1B4', 'P1C4', 'P1D4', 'P1E4', 'P1F4')] <- 92

P$ConcGr[P$Accession %in% c('P1A5', 'P1B5', 'P1C5', 'P1D5', 'P1G4', 'P1H4')] <- 295

P$ConcGr[P$Accession %in% c('P1A6', 'P1B6', 'P1E5', 'P1F5', 'P1G5', 'P1H5')] <- 943

P$ConcGr[P$Accession %in% c('P1C6', 'P1D6', 'P1E6', 'P1F6', 'P1G6', 'P1H6')] <- 3017

2.2 Sanity check

table(P$ConcGr)
## 
##   92  295  943 3017 
##   82  122  135  165
Pabs <- P
table(P$Accession)
## 
## P1A4 P1A5 P1A6 P1B4 P1B5 P1B6 P1C4 P1C5 P1C6 P1D4 P1D5 P1D6 P1E4 P1E5 P1E6 P1F4 
##   11   22   24   15   22   23   16   19   26   15   16   29   13   23   30   12 
## P1F5 P1F6 P1G4 P1G5 P1G6 P1H4 P1H5 P1H6 
##   19   28   22   24   24   21   22   28

2.3 Camera ready summary table

P.summary <- aggregate(. ~ ConcGr * Accession, data=P[,c('Accession', 'coli1',
    'coli2', 'smeg1', 'smeg2', 'ConcGr')], FUN=sum)
P.summary$nDetectedFlycodes <- aggregate(Sequence ~ ConcGr * Accession,
    data=na.omit(P), FUN=length)[,3]
P.summary$nTotalFlycodes <- 30
P.summary$coverage <- round(100 * P.summary$nDetectedFlycodes  / P.summary$nTotalFlycodes)
kable(P.summary[order(P.summary$ConcGr),], row.names = FALSE)
ConcGr Accession coli1 coli2 smeg1 smeg2 nDetectedFlycodes nTotalFlycodes coverage
92 P1A4 3473996 3670152 3524234 3629077 11 30 37
92 P1B4 3972002 3951215 3685668 3632472 15 30 50
92 P1C4 5582053 5716917 5700899 5623891 16 30 53
92 P1D4 5188468 5642192 5245252 5281381 15 30 50
92 P1E4 4697745 4824760 5051017 5146804 13 30 43
92 P1F4 3968239 4026336 4246879 4251951 12 30 40
295 P1A5 23970332 24546550 26479936 25950084 22 30 73
295 P1B5 16033017 16352277 17209165 17490691 22 30 73
295 P1C5 24016078 24734329 22810916 22589773 19 30 63
295 P1D5 17658145 17864382 17760463 17773297 16 30 53
295 P1G4 16016015 16618336 18270569 19036154 22 30 73
295 P1H4 19519544 20301903 20111340 20174474 21 30 70
943 P1A6 65538576 67507722 66123186 66400185 24 30 80
943 P1B6 55078431 58048226 50652047 50278919 23 30 77
943 P1E5 60305499 62836186 62173846 61952054 23 30 77
943 P1F5 46800670 49088967 47748401 47930977 19 30 63
943 P1G5 54312219 55033298 51968903 51106582 24 30 80
943 P1H5 59931716 61370897 55971370 56588298 22 30 73
3017 P1C6 168463728 180586807 158342929 158748952 26 30 87
3017 P1D6 159828074 163103139 140519542 138765621 29 30 97
3017 P1E6 166626467 176697478 159850632 161744675 30 30 100
3017 P1F6 191955804 203793127 181359940 182939796 28 30 93
3017 P1G6 183248427 189386707 177588746 178178834 24 30 80
3017 P1H6 181057205 185326019 161715001 152753819 28 30 93
# write.csv(P.summary, file='Figs/FigureS6b.csv')

The following function defines the computation and rendering of the by the paris function called panels.

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * r)
}
rv <-lapply(unique(P$ConcGr), function(q){
pairs((P[P$ConcGr == q ,c('coli1', 'coli2', 'smeg1', 'smeg2')]),
      pch=16, col=rgb(0.5,0.5,0.5,alpha = 0.3),
      lower.panel = panel.cor,
      asp=1,
      main=q)
})
## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

2.4 Define FCfill2max

to conduct a fair random experiment we add dummy flycodes to the input data.frame.

FCfill2max <- function(P, n = max(table(P$Accession))){
   for (i in unique(P$Accession)){
     idx <- which(P$Accession == i)
     # determine the number of missing rows for Accession i
     ndiff <- n - length(idx)
     
     if(length(idx) < n){
       cand <- P[idx[1], ]  
       cand[,2 ] <- "xxx"
       cand[,3:6 ] <- NA
       
       for (j in 1:ndiff){
         P <- rbind(P, cand)
       }
     }
   }
  P
}

2.5 Plot compute CVs for each row

P.mean <- apply(P[, c('coli1', 'coli2', 'smeg1', 'smeg2')],1, FUN=mean)
P.sd <- apply(P[, c('coli1', 'coli2', 'smeg1', 'smeg2')],1, FUN=sd)

boxplot(100*P.sd/P.mean ~ P$ConcGr,log='y', ylab='CV%')

P <- FCfill2max(P)

3 Absolute Quantification

3.1 Define function FlycodeAbsoluteStatistics using coli1 column

FlycodeAbsoluteStatistics <- function(P){
  
  PP <- aggregate(P$coli1 ~ P$Accession + P$ConcGr, FUN=sum)
  
  names(PP) <- c('Accession', 'ConcGr', 'coli1_sum')
  PPP <- aggregate(PP$coli1_sum ~ PP$ConcGr, FUN=mean)

  P.mean <- aggregate(PP$coli1_sum ~ PP$ConcGr, FUN=mean)
  P.sd <- aggregate(PP$coli1_sum ~ PP$ConcGr, FUN=sd)
  P.cv <- aggregate(PP$coli1_sum ~ PP$ConcGr,
      FUN = function(x){ 100 * sd(x) / mean(x) })
  P.length <- max(aggregate(P$coli1 ~ P$Accession, FUN=length)[,2])
 
 rv <- data.frame(ConcGr=P.sd[,1],
                  mean=P.mean[,2],
                  sd=P.sd[,2],
                  cv=round(P.cv[,2],2))
 rv$nFCAccession <-  P.length
 rv
}

3.2 Camera ready absolute table

kable(FlycodeAbsoluteStatistics(P))
ConcGr mean sd cv nFCAccession
92 4480417 811894.8 18.12 30
295 19535522 3685706.9 18.87 30
943 56994519 6440051.4 11.30 30
3017 175196618 12124519.7 6.92 30

3.3 Define FCrandom

# TODO(cp); make it work for n = 0
FCrandom <- function(P, n = 1){
  if(n == 0){
    return (P)
  }
  for (i in unique(P$Accession)){
    idx <- which(P$Accession == i)
    stopifnot(length(idx) >= n)
    smp <- sample(length(idx), n)
    P <- P[-idx[smp],]
  }
  P$n <- n
  P
}

3.4 Conduct the random experiment

set.seed(8)
S <- do.call('rbind', lapply(1:29, function(i){
  FlycodeAbsoluteStatistics(FCrandom(P, i))
  }))
xyplot(cv ~ nFCAccession | ConcGr, 
       data = S, 
       layout = c(4, 1),
       strip = strip.custom(strip.names = TRUE, strip.levels = TRUE)
       )

set.seed(1)

S.rep <- do.call('rbind', 
    lapply(1:n.simulation, function(s){
       S <- do.call('rbind', 
           lapply(1:29, function(i){
                FlycodeAbsoluteStatistics(FCrandom(P, i))
             }))
       }))

3.5 Camera ready plot of absolute flycode simulation

NestLink_absolute_flycode_simulation <- xyplot(cv ~ nFCAccession |  ConcGr,
       data = S.rep,
       panel = function(x,y,...){
         panel.abline(h=c(10, 50), col='red')
         panel.xyplot(x, y, ...)
         xy.median <- (aggregate(y, by=list(x), FUN=median, na.rm = TRUE))
         xy.quantile <- aggregate(y, by=list(x), FUN=function(d){quantile(d, c(0.05, 0.5, 0.95), na.rm = TRUE)})
         panel.points( xy.median[,1], xy.median[,2], pch=16, cex=0.5)
         # print((xy.quantile[,2])[,3])
         panel.points( xy.quantile[,1],(xy.quantile[,2])[,1], pch='-')
         panel.points( xy.quantile[,1],(xy.quantile[,2])[,3], pch='-')
       },
       xlab= 'Number of flycodes',
       ylab ='CV [%]',
       strip = strip.custom(strip.names = TRUE, strip.levels = TRUE),
       scales=list(x=list(at=c(1,5,10,15,20,25,30)),
                   y=list(at=c(0,10,50,100,150,200))),
       ylim=c(0,175),
       
       pch=16,
       col=rgb(0.5, 0.5, 0.5, alpha = 0.01),
       layout = c(4,1))

print(NestLink_absolute_flycode_simulation)

# png("NestLink_absolute_flycode_simulation.png", 1200,800)
# print(NestLink_absolute_flycode_simulation)
# dev.off()

4 Relative Quantification

4.1 Normalization

P <- na.omit(P)
P <- P[P$coli1 >0,]


P$coli1 <- P$coli1 / sum(P$coli1)
P$coli2 <- P$coli2 / sum(P$coli2)
P$smeg1 <- P$smeg1 / sum(P$smeg1)
P$smeg2 <- P$smeg2 / sum(P$smeg2)

sanity check

sum(P$coli1)
## [1] 1
sum(P$coli2)
## [1] 1
sum(P$smeg1)
## [1] 1
sum(P$smeg2)
## [1] 1

4.2 Define ratios

P$ratios <- (0.5* (P$coli1 + P$coli2)) / (0.5 * (P$smeg1 + P$smeg2))
b <- boxplot(P$coli1 / P$coli2, P$coli1 / P$smeg1, P$coli1 / P$smeg2,P$coli2 / P$smeg1, P$coli2 / P$smeg2 , P$smeg1 / P$smeg2, P$ratios, 
        ylab='ratios', ylim = c(0,2 ))
axis(1, 1:6, c('coli[12]', 'coli1-smeg1', 'coli1-smeg2', 'coli2-smeg1', 'coli2- smeg2','smeg[12]'))

op <- par(mfrow = c(1, 4))
boxplot(P$coli1 ~ P$ConcGr)
boxplot(P$coli2 ~ P$ConcGr)
boxplot(P$smeg1 ~ P$ConcGr)
boxplot(P$smeg2 ~ P$ConcGr)

4.3 Determine outliers (removal)

op <- par(mfrow=c(1,1), mar=c(5,5,5,2) )

b <- boxplot(df<-cbind(P$coli1/P$coli2, P$coli1/P$smeg1, P$coli1/P$smeg2, P$coli2/P$smeg1, P$coli2/P$smeg2, P$smeg1/P$smeg2, P$ratios),
             log='y',
        ylab='normalized ratios',
        #ylim = c(0, 2),
        axes=FALSE,
        main=paste("ConcGr = all"))
axis(1, 1:7, c('coli[12]', 'coli1-smeg1', 'coli1-smeg2', 'coli2-smeg1', 'coli2- smeg2','smeg[12]', 'ratio'))
abline(h=1, col='red')
box()
axis(2)
#axis(3, 1:7, b$n)
outliers.idx <- sapply(1:length(b$group),
    function(i){
      q <- df[, b$group[i]] == b$out[i];
      text(b$group[i], b$out[i], P[q, 2], pos=4, cex=0.4);
      text(b$group[i], b$out[i], P[q, 1], pos=2, cex=0.4);
      which(q)})