!!!Caution work in progress!!!
Function optimizes Extraction windows so we have the same number of precursor per window. To do it uses spectral library or nonredundant blib.
specL contains a function specL::cdsw
.
## Loading required package: DBI
## Loading required package: protViz
## Loading required package: RSQLite
## Loading required package: seqinr
quantile
# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
newfrom <- NULL
for(from in windfrom){
idx <- which.min(abs(from - empty))
startmass <- 0
if(isfrom){
if(empty[idx] < from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx-1]
}
}else{
if(empty[idx] > from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx+1]
}
}
newfrom <- c(newfrom, round(startmass,digits=1))
}
return(newfrom)
}
.cdsw_compute_breaks <-
function(xx, nbins){
q <- quantile(xx, seq(0, 1, length = nbins + 1))
q[1] <- q[1] - 0.5
q[length(q)] <- q[length(q)] + 0.5
q <- round(q)
}
cdsw <-
function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
if (class(x) == "psmSet") {
x <- unlist(lapply(x, function(x) {
x$pepmass
}))
} else if (class(x) == 'specLSet') {
x <- unlist(lapply(x@ionlibrary, function(xx) {
xx@q1
}))
}
# x should be numeric
if (class(x) != "numeric") {
warning("can not compute quantils. 'x' is not numeric.")
return (NULL)
}
x <- x[x > massrange[1] & x < massrange[2]]
q <- FUN(xx=x, nbins=n)
idx <- 1:n
from <- q[idx] - overlap * 0.5
to <- q[idx + 1] + overlap * 0.5
width <- 0.5 * (to - from)
mid <- from + width
h <- hist(x, breaks = q, ...)
data.frame(from, to, mid, width, counts = h$counts)
}
cdsw(exampledata,
freq=TRUE,
overlap = 0,
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## from to mid width counts
## 0% 301 384 342.5 41.5 586
## 5% 384 420 402.0 18.0 591
## 10% 420 449 434.5 14.5 583
## 15% 449 476 462.5 13.5 599
## 20% 476 500 488.0 12.0 565
## 25% 500 524 512.0 12.0 604
## 30% 524 547 535.5 11.5 566
## 35% 547 572 559.5 12.5 598
## 40% 572 598 585.0 13.0 591
## 45% 598 625 611.5 13.5 577
## 50% 625 651 638.0 13.0 603
## 55% 651 682 666.5 15.5 575
## 60% 682 713 697.5 15.5 594
## 65% 713 745 729.0 16.0 574
## 70% 745 783 764.0 19.0 595
## 75% 783 825 804.0 21.0 572
## 80% 825 881 853.0 28.0 592
## 85% 881 948 914.5 33.5 583
## 90% 948 1045 996.5 48.5 592
## 95% 1045 1250 1147.5 102.5 586
.cdsw_objective <- function(splits, data){
counts <- hist(data, breaks=splits,plot=FALSE)$counts
nbins<-length(splits)-1
optimumN <- length(data)/(length(splits)-1)
optimumN<-rep(optimumN,nbins)
score2 <-sqrt(sum((counts - optimumN)^2))
score1 <- sum(abs(counts - round(optimumN)))
return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}
.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
difsp<-diff(splits)
return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}
.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
breaks <- nbins+1
#xx <- x
#xx<-xx[xx >=310 & xx<1250]
# TODO(wew): there is something insconsitent with the nbins parameter
qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
plot(1:breaks, qqs, type="b" ,
sub=".cdsw_compute_sampling_breaks")
legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
paste("nbins = ", breaks) ))
# equidistant spaced bins
unif <- seq(min(xx),max(xx),length=(breaks))
lines(1:breaks,unif,col=2,type="b")
if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
}else{
.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
}
mixeddata <- xx
it_count <- 0
error <- 0
while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
it_count <- it_count + 1
uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
mixeddata<-c(mixeddata,uniformdata)
qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
lines(1:breaks,qqs,type="l", col="#00DD00AA")
error[it_count] <-.cdsw_objective(qqs, xx)$score1
}
lines(1:breaks,qqs,type="b", col="#FF1111AA")
plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
qqs <- as.numeric(sort(round(qqs)))
qqs[1] <- qqs[1] - 0.5
qqs[length(qqs)] <- qqs[length(qqs)] + 0.5
round(qqs, 1)
}
op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
n=35,
massrange = c(350,1250),
sub='sampling based',
main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
res <- hist(ms1data, breaks=breaks)
abline(v=wind$from,col=2,lty=2)
abline(v=wind$to,col=3,lty=2)
empty <- res$mids[which(res$counts < maxbin )]
newfrom <- .makenewfromto(wind$from , empty)
newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
plot(res,xlim=c(1060,1065))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
plot(res,xlim=c(520,550))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
width <- (newto - newfrom) * 0.5
mid <- (newfrom + newto)*0.5
newCounts <- NULL
for(i in 1:length(newfrom))
{
newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
}
data.frame(newfrom, newto, mid, width, counts =newCounts)
}
readjustWindows(wind,exampledata)
## newfrom newto mid width counts
## 1 349.5 378.1 363.80 14.30 293
## 2 378.0 401.1 389.55 11.55 335
## 3 401.0 422.1 411.55 10.55 365
## 4 422.0 442.1 432.05 10.05 417
## 5 442.0 461.1 451.55 9.55 382
## 6 461.0 479.1 470.05 9.05 424
## 7 479.0 496.1 487.55 8.55 414
## 8 496.0 513.0 504.50 8.50 423
## 9 513.0 531.0 522.00 9.00 437
## 10 530.9 548.0 539.45 8.55 428
## 11 547.9 564.0 555.95 8.05 390
## 12 564.0 582.0 573.00 9.00 419
## 13 582.0 601.0 591.50 9.50 428
## 14 601.0 621.0 611.00 10.00 426
## 15 621.0 639.0 630.00 9.00 410
## 16 639.0 656.0 647.50 8.50 383
## 17 656.0 679.0 667.50 11.50 395
## 18 679.0 696.0 687.50 8.50 369
## 19 696.0 718.0 707.00 11.00 393
## 20 718.0 740.0 729.00 11.00 395
## 21 740.0 760.0 750.00 10.00 362
## 22 760.0 786.0 773.00 13.00 368
## 23 785.9 811.0 798.45 12.55 344
## 24 811.0 836.0 823.50 12.50 317
## 25 836.0 865.0 850.50 14.50 301
## 26 865.0 898.0 881.50 16.50 307
## 27 898.0 929.1 913.55 15.55 273
## 28 929.0 959.1 944.05 15.05 255
## 29 959.0 994.1 976.55 17.55 215
## 30 994.0 1031.0 1012.50 18.50 209
## 31 1031.0 1068.0 1049.50 18.50 188
## 32 1068.0 1107.0 1087.50 19.50 177
## 33 1107.0 1151.0 1129.00 22.00 122
## 34 1151.0 1200.0 1175.50 24.50 101
## 35 1200.0 1249.5 1224.75 24.75 71
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=35,
overlap = 0,
sub='quantile based',
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
## from to mid width counts
## 0% 301 363 332.0 31.0 332
## 2.857143% 363 390 376.5 13.5 343
## 5.714286% 390 410 400.0 10.0 328
## 8.571429% 410 429 419.5 9.5 331
## 11.42857% 429 445 437.0 8.0 339
## 14.28571% 445 462 453.5 8.5 347
## 17.14286% 462 476 469.0 7.0 339
## 20% 476 489 482.5 6.5 315
## 22.85714% 489 503 496.0 7.0 333
## 25.71429% 503 517 510.0 7.0 353
## 28.57143% 517 531 524.0 7.0 338
## 31.42857% 531 544 537.5 6.5 316
## 34.28571% 544 557 550.5 6.5 337
## 37.14286% 557 572 564.5 7.5 341
## 40% 572 586 579.0 7.0 337
## 42.85714% 586 601 593.5 7.5 324
## 45.71429% 601 617 609.0 8.0 350
## 48.57143% 617 632 624.5 7.5 332
## 51.42857% 632 646 639.0 7.0 321
## 54.28571% 646 663 654.5 8.5 342
## 57.14286% 663 682 672.5 9.5 340
## 60% 682 698 690.0 8.0 336
## 62.85714% 698 716 707.0 9.0 323
## 65.71429% 716 736 726.0 10.0 350
## 68.57143% 736 754 745.0 9.0 328
## 71.42857% 754 776 765.0 11.0 335
## 74.28571% 776 800 788.0 12.0 336
## 77.14286% 800 825 812.5 12.5 327
## 80% 825 855 840.0 15.0 344
## 82.85714% 855 891 873.0 18.0 330
## 85.71429% 891 928 909.5 18.5 334
## 88.57143% 928 969 948.5 20.5 340
## 91.42857% 969 1029 999.0 30.0 337
## 94.28571% 1029 1097 1063.0 34.0 332
## 97.14286% 1097 1250 1173.5 76.5 336
op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
main = paste("max window size", mws), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = mws)
})
})
op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=nbins,
overlap = 0,
main = paste("nr bins", nbins), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = 100)
})
})
Here is the output of sessionInfo()
on the system on which this document was compiled:
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows Server 2012 R2 x64 (build 9600)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=C
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] specL_1.10.0 seqinr_3.3-6 RSQLite_1.1-2 protViz_0.2.28
## [5] DBI_0.6-1 BiocStyle_2.4.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.10 digest_0.6.12 rprojroot_1.2 backports_1.0.5
## [5] magrittr_1.5 evaluate_0.10 stringi_1.1.5 rmarkdown_1.4
## [9] tools_3.4.0 ade4_1.7-6 stringr_1.2.0 yaml_2.1.14
## [13] compiler_3.4.0 memoise_1.1.0 htmltools_0.3.5 knitr_1.15.1