## <script type="text/javascript">
## document.addEventListener("DOMContentLoaded", function() {
##   document.querySelector("h1").style.marginTop = "0";
## });
## </script>
## <script type="text/javascript">
## document.addEventListener("DOMContentLoaded", function() {
##   var links = document.links;  
##   for (var i = 0, linksLength = links.length; i < linksLength; i++)
##     if (links[i].hostname != window.location.hostname)
##       links[i].target = '_blank';
## });
## </script>

Mapping proteins to their genomic coordinates

Laurent Gatto

Introduction

The aim of this document is to document to mapping of proteins and the tandem mass spectrometry derived peptides they have been inferred from to genomic locations.

Mapping proteins to a genome reference.

Proteins and genome data

We will use a small object from Pbase to illustrate how to retrieve genome coordinates and map a protein back to genomic coordinates. In the remainder of this vignette, we will concentrate on a spectific protein, namely P00558.

library("Pbase")
data(p)
p
## S4 class type     : Proteins
## Class version     : 0.1
## Created           : Wed Jul 16 02:58:33 2014
## Number of Proteins: 9
## Sequences:
##   [1] A4UGR9 [2] A6H8Y1 ... [8] P04075-2 [9] P60709
## Sequence features:
##   [1] DB [2] AccessionNumber ... [10] Comment [11] Filename
## Peptide features:
##   [1] DB [2] AccessionNumber ... [46] spectrumFile [47] databaseFile
seqnames(p)
## [1] "A4UGR9"   "A6H8Y1"   "O43707"   "O75369"   "P00558"   "P02545"  
## [7] "P04075"   "P04075-2" "P60709"
dat <- data.frame(i = 1:length(p),
                  npeps = elementLengths(pfeatures(p)),
                  protln = width(aa(p)))
dat
##          i npeps protln
## A4UGR9   1    36   3374
## A6H8Y1   2    23   2624
## O43707   3     6    911
## O75369   4    13   2602
## P00558   5     5    417
## P02545   6    12    664
## P04075   7    21    364
## P04075-2 8    20    418
## P60709   9     1    375
sp <- seqnames(p)[5]

Querying with biomaRt

The input information consists of a UniProt identifier and a set of peptides positions along the protein. The first step is to map the protein accession number to a gene identifier (here, we will use Ensembl) and to obtain genome coordinates.

Multiple solutiona are offered to use. Below, we use the biomaRt Bioconductor package.

library("biomaRt")
ens <- useMart("ensembl", "hsapiens_gene_ensembl")

bm <- select(ens, keys = sp,
             keytype = "uniprot_swissprot_accession",
             columns = c(
                 "ensembl_gene_id",
                 "ensembl_transcript_id",
                 "ensembl_peptide_id",
                 "ensembl_exon_id",  
                 "description",     
                 "chromosome_name",
                 "strand",
                 "exon_chrom_start",
                 "exon_chrom_end",
                 "5_utr_start",
                 "5_utr_end",
                 "3_utr_start",
                 "3_utr_end",
                 "start_position",     
                 "end_position"))

bm
##    ensembl_gene_id ensembl_transcript_id ensembl_peptide_id
## 1  ENSG00000102144       ENST00000373316    ENSP00000362413
## 2  ENSG00000102144       ENST00000373316    ENSP00000362413
## 3  ENSG00000102144       ENST00000373316    ENSP00000362413
## 4  ENSG00000102144       ENST00000373316    ENSP00000362413
## 5  ENSG00000102144       ENST00000373316    ENSP00000362413
## 6  ENSG00000102144       ENST00000373316    ENSP00000362413
## 7  ENSG00000102144       ENST00000373316    ENSP00000362413
## 8  ENSG00000102144       ENST00000373316    ENSP00000362413
## 9  ENSG00000102144       ENST00000373316    ENSP00000362413
## 10 ENSG00000102144       ENST00000373316    ENSP00000362413
## 11 ENSG00000102144       ENST00000373316    ENSP00000362413
## 12 ENSG00000269666       ENST00000597340    ENSP00000472461
## 13 ENSG00000269666       ENST00000597340    ENSP00000472461
## 14 ENSG00000269666       ENST00000597340    ENSP00000472461
## 15 ENSG00000269666       ENST00000597340    ENSP00000472461
## 16 ENSG00000269666       ENST00000597340    ENSP00000472461
## 17 ENSG00000269666       ENST00000597340    ENSP00000472461
## 18 ENSG00000269666       ENST00000597340    ENSP00000472461
## 19 ENSG00000269666       ENST00000597340    ENSP00000472461
## 20 ENSG00000269666       ENST00000597340    ENSP00000472461
## 21 ENSG00000269666       ENST00000597340    ENSP00000472461
## 22 ENSG00000269666       ENST00000597340    ENSP00000472461
##    ensembl_exon_id                                             description
## 1  ENSE00001600900 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 2  ENSE00003506377 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 3  ENSE00003502842 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 4  ENSE00003512377 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 5  ENSE00003581136 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 6  ENSE00003597777 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 7  ENSE00000672997 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 8  ENSE00000672996 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 9  ENSE00000672995 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 10 ENSE00000672994 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 11 ENSE00001948816 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 12 ENSE00003177038 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 13 ENSE00003462979 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 14 ENSE00003487079 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 15 ENSE00003549531 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 16 ENSE00003639123 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 17 ENSE00003515683 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 18 ENSE00003045537 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 19 ENSE00003016554 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 20 ENSE00003003175 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 21 ENSE00003138826 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 22 ENSE00003081515 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
##    chromosome_name strand exon_chrom_start exon_chrom_end 5_utr_start
## 1                X      1         77359671       77359902    77359671
## 2                X      1         77365364       77365414          NA
## 3                X      1         77369241       77369396          NA
## 4                X      1         77369513       77369657          NA
## 5                X      1         77372809       77372912          NA
## 6                X      1         77373548       77373667          NA
## 7                X      1         77378332       77378446          NA
## 8                X      1         77378692       77378871          NA
## 9                X      1         77380371       77380548          NA
## 10               X      1         77380824       77380922          NA
## 11               X      1         77381287       77384793          NA
## 12    HG1426_PATCH      1         77365128       77365359    77365128
## 13    HG1426_PATCH      1         77370821       77370871          NA
## 14    HG1426_PATCH      1         77374698       77374853          NA
## 15    HG1426_PATCH      1         77374970       77375114          NA
## 16    HG1426_PATCH      1         77378266       77378369          NA
## 17    HG1426_PATCH      1         77379005       77379124          NA
## 18    HG1426_PATCH      1         77383789       77383903          NA
## 19    HG1426_PATCH      1         77384149       77384328          NA
## 20    HG1426_PATCH      1         77385828       77386005          NA
## 21    HG1426_PATCH      1         77386281       77386379          NA
## 22    HG1426_PATCH      1         77386744       77390250          NA
##    5_utr_end 3_utr_start 3_utr_end start_position end_position
## 1   77359837          NA        NA       77320685     77384793
## 2         NA          NA        NA       77320685     77384793
## 3         NA          NA        NA       77320685     77384793
## 4         NA          NA        NA       77320685     77384793
## 5         NA          NA        NA       77320685     77384793
## 6         NA          NA        NA       77320685     77384793
## 7         NA          NA        NA       77320685     77384793
## 8         NA          NA        NA       77320685     77384793
## 9         NA          NA        NA       77320685     77384793
## 10        NA          NA        NA       77320685     77384793
## 11        NA    77381328  77384793       77320685     77384793
## 12  77365294          NA        NA       77326142     77390250
## 13        NA          NA        NA       77326142     77390250
## 14        NA          NA        NA       77326142     77390250
## 15        NA          NA        NA       77326142     77390250
## 16        NA          NA        NA       77326142     77390250
## 17        NA          NA        NA       77326142     77390250
## 18        NA          NA        NA       77326142     77390250
## 19        NA          NA        NA       77326142     77390250
## 20        NA          NA        NA       77326142     77390250
## 21        NA          NA        NA       77326142     77390250
## 22        NA    77386785  77390250       77326142     77390250

We will only consider the gene located on the X chromosome and ignore the sequence patch and conveniently store the transcript and protein identifiers for later use.

bm <- bm[bm$chromosome_name == "X", ]
tr <- unique(bm$ensembl_transcript_id)
pr <- unique(bm$ensembl_peptide_id)

The information retrieved gives us various information, in particular the Ensembl gene identifer ENSG00000102144 and its starting and ending position 77320685 and 77384793 on chromosome X.

Genome visualisation with Gviz

While the above defines all necessary genomic coordinates, and as we will make use of the Gviz plotting infrastructure, it is more straightforward to fetch the above information via biomaRt using specialised Gviz infrastructure.

library("Gviz")
options(ucscChromosomeNames=FALSE)

bmTrack <- BiomartGeneRegionTrack(start = min(bm$start_position),
                                  end = max(bm$end_position),
                                  chromosome = "chrX",
                                  genome = "hg19")

## see also the filters param and getBM to
## filter on biotype == "protein_coding"

bmTracks <- split(bmTrack, transcript(bmTrack))

grTrack <- bmTracks[[which(names(bmTracks) == tr)]]
names(grTrack) <- tr

We can convince ourselves that the manual retrieved data in bm and the data in the BiomartRegionTrack subsequently subset into a GeneRegionTrack match.

as(grTrack, "data.frame")
##    X.seqnames  X.start    X.end X.width X.strand      X.feature
## 1           X 77359671 77359837     167        +           utr5
## 2           X 77359838 77359902      65        + protein_coding
## 3           X 77365364 77365414      51        + protein_coding
## 4           X 77369241 77369396     156        + protein_coding
## 5           X 77369513 77369657     145        + protein_coding
## 6           X 77372809 77372912     104        + protein_coding
## 7           X 77373548 77373667     120        + protein_coding
## 8           X 77378332 77378446     115        + protein_coding
## 9           X 77378692 77378871     180        + protein_coding
## 10          X 77380371 77380548     178        + protein_coding
## 11          X 77380824 77380922      99        + protein_coding
## 12          X 77381287 77381327      41        + protein_coding
## 13          X 77381328 77384793    3466        +           utr3
##             X.gene          X.exon    X.transcript X.symbol X.rank X.phase
## 1  ENSG00000102144 ENSE00001600900 ENST00000373316     PGK1      1      -1
## 2  ENSG00000102144 ENSE00001600900 ENST00000373316     PGK1      1      -1
## 3  ENSG00000102144 ENSE00003506377 ENST00000373316     PGK1      2       2
## 4  ENSG00000102144 ENSE00003502842 ENST00000373316     PGK1      3       2
## 5  ENSG00000102144 ENSE00003512377 ENST00000373316     PGK1      4       2
## 6  ENSG00000102144 ENSE00003581136 ENST00000373316     PGK1      5       0
## 7  ENSG00000102144 ENSE00003597777 ENST00000373316     PGK1      6       2
## 8  ENSG00000102144 ENSE00000672997 ENST00000373316     PGK1      7       2
## 9  ENSG00000102144 ENSE00000672996 ENST00000373316     PGK1      8       0
## 10 ENSG00000102144 ENSE00000672995 ENST00000373316     PGK1      9       0
## 11 ENSG00000102144 ENSE00000672994 ENST00000373316     PGK1     10       1
## 12 ENSG00000102144 ENSE00001948816 ENST00000373316     PGK1     11       0
## 13 ENSG00000102144 ENSE00001948816 ENST00000373316     PGK1     11      -1
##           feature            gene            exon      transcript symbol
## 1            utr5 ENSG00000102144 ENSE00001600900 ENST00000373316   PGK1
## 2  protein_coding ENSG00000102144 ENSE00001600900 ENST00000373316   PGK1
## 3  protein_coding ENSG00000102144 ENSE00003506377 ENST00000373316   PGK1
## 4  protein_coding ENSG00000102144 ENSE00003502842 ENST00000373316   PGK1
## 5  protein_coding ENSG00000102144 ENSE00003512377 ENST00000373316   PGK1
## 6  protein_coding ENSG00000102144 ENSE00003581136 ENST00000373316   PGK1
## 7  protein_coding ENSG00000102144 ENSE00003597777 ENST00000373316   PGK1
## 8  protein_coding ENSG00000102144 ENSE00000672997 ENST00000373316   PGK1
## 9  protein_coding ENSG00000102144 ENSE00000672996 ENST00000373316   PGK1
## 10 protein_coding ENSG00000102144 ENSE00000672995 ENST00000373316   PGK1
## 11 protein_coding ENSG00000102144 ENSE00000672994 ENST00000373316   PGK1
## 12 protein_coding ENSG00000102144 ENSE00001948816 ENST00000373316   PGK1
## 13           utr3 ENSG00000102144 ENSE00001948816 ENST00000373316   PGK1
##    rank phase
## 1     1    -1
## 2     1    -1
## 3     2     2
## 4     3     2
## 5     4     2
## 6     5     0
## 7     6     2
## 8     7     2
## 9     8     0
## 10    9     0
## 11   10     1
## 12   11     0
## 13   11    -1

Below, we create additional ideogram and axis tracks and produce a visualisation of our genomic coordinates of interest.

ideoTrack <- IdeogramTrack(genome = "hg19",
                           chromosome = "chrX")
axisTrack <- GenomeAxisTrack()
seqlevels(ranges(grTrack)) <- 
    chromosome(grTrack) <-
        chromosome(ideoTrack)
plotTracks(list(ideoTrack, axisTrack, grTrack),
           add53 = TRUE, add35 = TRUE)

plot of chunk gvizfig

Using TranscriptDb instances

Finally, on could also use TranscriptDb objects to create the GeneRegionTrack. Below, we use the UCSC annotation (which is directly available from Bioconductor; Ensembl TranscriptDb instances could be generated manually - TODO document this) to extract our regions of interest.

library("TxDb.Hsapiens.UCSC.hg19.knownGene")
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txTr <- GeneRegionTrack(txdb, chromosome = "chrX",
                        start = bm$start_position[1],
                        end = bm$end_position[1])

head(as(txTr, "data.frame"))
##   X.seqnames  X.start    X.end X.width X.strand X.feature    X.id
## 1       chrX 77359666 77359837     172        +      utr5 unknown
## 2       chrX 77359838 77359902      65        +       CDS unknown
## 3       chrX 77361859 77362168     310        +      utr5 unknown
## 4       chrX 77365364 77365382      19        +      utr5 unknown
## 5       chrX 77365364 77365414      51        +       CDS unknown
## 6       chrX 77365383 77365414      32        +       CDS unknown
##         X.exon X.transcript X.gene   X.symbol X.density feature      id
## 1 uc004ecz.4_1   uc004ecz.4   5230 uc004ecz.4         1    utr5 unknown
## 2 uc004ecz.4_1   uc004ecz.4   5230 uc004ecz.4         1     CDS unknown
## 3 uc011mqq.2_1   uc011mqq.2   5230 uc011mqq.2         1    utr5 unknown
## 4 uc011mqq.2_2   uc011mqq.2   5230 uc011mqq.2         1    utr5 unknown
## 5 uc004ecz.4_2   uc004ecz.4   5230 uc004ecz.4         1     CDS unknown
## 6 uc011mqq.2_2   uc011mqq.2   5230 uc011mqq.2         1     CDS unknown
##           exon transcript gene     symbol density
## 1 uc004ecz.4_1 uc004ecz.4 5230 uc004ecz.4       1
## 2 uc004ecz.4_1 uc004ecz.4 5230 uc004ecz.4       1
## 3 uc011mqq.2_1 uc011mqq.2 5230 uc011mqq.2       1
## 4 uc011mqq.2_2 uc011mqq.2 5230 uc011mqq.2       1
## 5 uc004ecz.4_2 uc004ecz.4 5230 uc004ecz.4       1
## 6 uc011mqq.2_2 uc011mqq.2 5230 uc011mqq.2       1

Peptide features

The peptides that have been experimentally observed are available as ranges (coordinates) along the protein sequences. For example, below, we see 5 peptides (ELNYFAKALESPER, DLMSKAEK, QIVWNGPVGVFEWEAFAR, FHVEEEGKGKDASGNK, GTKALMDEVVK) have been identified for our protein of interest P00558.

pp <- p[sp]
pranges(pp)
## IRangesList of length 1
## $P00558
## IRanges of length 5
##     start end width  names
## [1]   193 206    14 P00558
## [2]   268 275     8 P00558
## [3]   333 350    18 P00558
## [4]   124 139    16 P00558
## [5]   351 361    11 P00558
plot(pp)

plot of chunk pepsfig

The aim of this document is to document the mapping of peptides, i.e. ranges along a protein sequence to ranges along the genome reference. In other words, our aim is the convert protein coordinates to genome coordinates.

Mapping peptides back to the genome

Additional features of interest (in our case peptides) can be added to the genomic visualisation using dedicated annotation tracks, that can be constructed as shown below.

Comparing protein and translated DNA sequences

The first check that we want to implement is to verify that we can regenerate the protein amino acid sequence from the genome regions that we have extracted. We start by subsetting only the actual protein coding regions, i.e ignoring the 5' and 3' untranslated regions of our Genome Region track.

(prng <- ranges(grTrack[feature(grTrack) == "protein_coding",]))
## GRanges object with 11 ranges and 7 metadata columns:
##        seqnames               ranges strand   |        feature
##           <Rle>            <IRanges>  <Rle>   |    <character>
##    [1]     chrX [77359838, 77359902]      +   | protein_coding
##    [2]     chrX [77365364, 77365414]      +   | protein_coding
##    [3]     chrX [77369241, 77369396]      +   | protein_coding
##    [4]     chrX [77369513, 77369657]      +   | protein_coding
##    [5]     chrX [77372809, 77372912]      +   | protein_coding
##    ...      ...                  ...    ... ...            ...
##    [7]     chrX [77378332, 77378446]      +   | protein_coding
##    [8]     chrX [77378692, 77378871]      +   | protein_coding
##    [9]     chrX [77380371, 77380548]      +   | protein_coding
##   [10]     chrX [77380824, 77380922]      +   | protein_coding
##   [11]     chrX [77381287, 77381327]      +   | protein_coding
##                   gene            exon      transcript      symbol
##            <character>     <character>     <character> <character>
##    [1] ENSG00000102144 ENSE00001600900 ENST00000373316        PGK1
##    [2] ENSG00000102144 ENSE00003506377 ENST00000373316        PGK1
##    [3] ENSG00000102144 ENSE00003502842 ENST00000373316        PGK1
##    [4] ENSG00000102144 ENSE00003512377 ENST00000373316        PGK1
##    [5] ENSG00000102144 ENSE00003581136 ENST00000373316        PGK1
##    ...             ...             ...             ...         ...
##    [7] ENSG00000102144 ENSE00000672997 ENST00000373316        PGK1
##    [8] ENSG00000102144 ENSE00000672996 ENST00000373316        PGK1
##    [9] ENSG00000102144 ENSE00000672995 ENST00000373316        PGK1
##   [10] ENSG00000102144 ENSE00000672994 ENST00000373316        PGK1
##   [11] ENSG00000102144 ENSE00001948816 ENST00000373316        PGK1
##             rank     phase
##        <numeric> <integer>
##    [1]         1        -1
##    [2]         2         2
##    [3]         3         2
##    [4]         4         2
##    [5]         5         0
##    ...       ...       ...
##    [7]         7         2
##    [8]         8         0
##    [9]         9         0
##   [10]        10         1
##   [11]        11         0
##   -------
##   seqinfo: 1 sequence from hg19 genome; no seqlengths

We also need the actual genome sequence (so far, we have only dealt with regions and features). There are multiple ways to obtain genome sequences with R and Bioconductor. As we have been working with the Ensembl reference genome, we could simply download the whole genome of only the chromosome X and load it as an AAStringSet and extract the protein coding regions of interest at the coordinates defined by the ranges in prng.

library("Biostrings")
chrx <- readDNAStringSet("./Homo_sapiens.GRCh37.75.dna.chromosome.X.fa.gz")
seq <- extractAt(chrx[[1]], ranges(prng))
pptr <- translate(unlist(seq))

It is also possible to use readily package genomes to do this. Above, we have use the TxDb.Hsapiens.UCSC.hg19.knownGene package defining transcripts. There is a similar package for the human UCSC genome build, namely BSgenome.Hsapiens.UCSC.hg19. However, as we have focused on Ensembl data annotation, we will first need to convert our Ensembl transcript (or protein) identifier ENST00000373316 (ENSP00000362413) into the UCSC equivalent. (TODO Use also Homo.sapiens annotation package.)

ucsc <- select(ens, key = pr,
               keytype = "ensembl_peptide_id",
               columns = "ucsc")
ucsc
##         ucsc
## 1 uc004ecz.4
library("BSgenome.Hsapiens.UCSC.hg19")
prng2 <- ranges(txTr[transcript(txTr) %in% ucsc])
prng2 <- prng2[prng2$feature == "CDS"]

seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, prng2)
pptr <- translate(unlist(seq))
## remove stop codon
pptr <- pptr[1:417]
writePairwiseAlignments(pairwiseAlignment(pp[[1]], pptr))
## ########################################
## # Program: Biostrings (version 2.34.0), a Bioconductor package
## # Rundate: Mon Oct 13 21:06:35 2014
## ########################################
## #=======================================
## #
## # Aligned_sequences: 2
## # 1: P1
## # 2: S1
## # Matrix: NA
## # Gap_penalty: 14.0
## # Extend_penalty: 4.0
## #
## # Length: 417
## # Identity:     417/417 (100.0%)
## # Similarity:    NA/417 (NA%)
## # Gaps:           0/417 (0.0%)
## # Score: 1794.636
## #
## #
## #=======================================
## 
## P1                 1 MSLSNKLTLDKLDVKGKRVVMRVDFNVPMKNNQITNNQRIKAAVPSIKFC     50
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1                 1 MSLSNKLTLDKLDVKGKRVVMRVDFNVPMKNNQITNNQRIKAAVPSIKFC     50
## 
## P1                51 LDNGAKSVVLMSHLGRPDGVPMPDKYSLEPVAVELKSLLGKDVLFLKDCV    100
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1                51 LDNGAKSVVLMSHLGRPDGVPMPDKYSLEPVAVELKSLLGKDVLFLKDCV    100
## 
## P1               101 GPEVEKACANPAAGSVILLENLRFHVEEEGKGKDASGNKVKAEPAKIEAF    150
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1               101 GPEVEKACANPAAGSVILLENLRFHVEEEGKGKDASGNKVKAEPAKIEAF    150
## 
## P1               151 RASLSKLGDVYVNDAFGTAHRAHSSMVGVNLPQKAGGFLMKKELNYFAKA    200
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1               151 RASLSKLGDVYVNDAFGTAHRAHSSMVGVNLPQKAGGFLMKKELNYFAKA    200
## 
## P1               201 LESPERPFLAILGGAKVADKIQLINNMLDKVNEMIIGGGMAFTFLKVLNN    250
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1               201 LESPERPFLAILGGAKVADKIQLINNMLDKVNEMIIGGGMAFTFLKVLNN    250
## 
## P1               251 MEIGTSLFDEEGAKIVKDLMSKAEKNGVKITLPVDFVTADKFDENAKTGQ    300
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1               251 MEIGTSLFDEEGAKIVKDLMSKAEKNGVKITLPVDFVTADKFDENAKTGQ    300
## 
## P1               301 ATVASGIPAGWMGLDCGPESSKKYAEAVTRAKQIVWNGPVGVFEWEAFAR    350
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1               301 ATVASGIPAGWMGLDCGPESSKKYAEAVTRAKQIVWNGPVGVFEWEAFAR    350
## 
## P1               351 GTKALMDEVVKATSRGCITIIGGGDTATCCAKWNTEDKVSHVSTGGGASL    400
##                      ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1               351 GTKALMDEVVKATSRGCITIIGGGDTATCCAKWNTEDKVSHVSTGGGASL    400
## 
## P1               401 ELLEGKVLPGVDALSNI    417
##                      |||||||||||||||||
## S1               401 ELLEGKVLPGVDALSNI    417
## 
## 
## #---------------------------------------
## #---------------------------------------

Calculating new coordinates

To calculate the genomic coordinates of peptides we

1) Calculate the contiguous exon coordinates on the cDNA sequence prex.

2) Calculate the peptide corrdinates on the cDNA sequence peprgn2 from the original peptide ranges along the protein sequence.

3) We calculate the indices of the exons in which all the peptides start and end on the cDNA sequence start_ex and end_ex.

4) We infer the coordinates on the genome from the actual peptide starting/ending position and exon index on the cDNA sequence, the exons coordinates on the cDNA sequences and the matching exons coordinates on the genome and store these in an IRanges object.

## exons ranges along the protein
j <- cumsum(width(seq))
i <- cumsum(c(1, width(seq)))[1:length(j)]
prex <- IRanges(start = i, end = j)

peprng <- pranges(pp)[[1]]
## peptide positions on cdna
peprng2 <- IRanges(start = 1 + (start(peprng)-1) * 3,
                   width = width(peprng) * 3)

## find exon and position in prex 
start_ex <- subjectHits(findOverlaps(start(peprng2), prex))
end_ex <- subjectHits(findOverlaps(end(peprng2), prex))

getPos <- function(p, i, prtex = prex, nclex = prng2) {
    ## position in cdna
    ## exon index
    start(prng2[i]) + (p - start(prtex[i]))
}

peptides_on_genome <- IRanges(start = getPos(start(peprng2), start_ex),
                              end = getPos(end(peprng2), end_ex))

Plotting

Based on the new peptide genomic coordinates, it is now straightforward to create a new AnnotationTrack and add it the the track visualisation.

pepTr <- AnnotationTrack(start = start(peptides_on_genome),
                         end = end(peptides_on_genome),
                         chr = "chrX", genome = "hg19",
                         strand = "*",
                         id = pcols(pp)[[1]]$pepseq,
                         name = "pfeatures",
                         col = "steelblue")


plotTracks(list(ideoTrack, axisTrack, grTrack, pepTr),
           groupAnnotation = "id",
           just.group = "below",
           fontsize.group = 9,
           add53 = TRUE, add35 = TRUE)

plot of chunk pepcoords

DetailsAnnotationTrack

Finally, we customise the figure by adding a track with the \(MS^2\) spectra. The raw data used to search the protein database an create p are available as an MSnExp object.

data(pms)

library("ggplot2")
details <- function(identifier, ...) {
    p <- plot(pms[[as.numeric(identifier)]], full=TRUE, plot=FALSE) + ggtitle("") 
    p <- p + theme_bw() + theme(axis.text.y = element_blank(),
                                axis.text.x = element_blank()) + 
                                labs(x = NULL, y = NULL)

    print(p, newpage=FALSE)
}

deTrack <- AnnotationTrack(start = start(peptides_on_genome),
                           end = end(peptides_on_genome),
                           genome = "hg19", chromosom = "chrX",
                           id = pcols(pp)[[1]]$acquisitionnum,
                           name = "MS2 spectra",
                           stacking = "squish", fun = details)

plotTracks(list(ideoTrack, axisTrack, deTrack, grTrack),
           add53 = TRUE, add35 = TRUE)

plot of chunk msmsspectra

TODO Check spectra. Describe how data tracks can be used to overlay additional information, such as quantitation data, identification scores, coverage, …

Session information

sessionInfo()
## R version 3.1.1 Patched (2014-09-25 r66681)
## Platform: x86_64-unknown-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] stats4    grid      parallel  stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] TxDb.Hsapiens.UCSC.hg19.knownGene_3.0.0
##  [2] GenomicFeatures_1.18.0                 
##  [3] AnnotationDbi_1.28.0                   
##  [4] Biobase_2.26.0                         
##  [5] ggplot2_1.0.0                          
##  [6] BSgenome.Hsapiens.UCSC.hg19_1.4.0      
##  [7] BSgenome_1.34.0                        
##  [8] rtracklayer_1.26.0                     
##  [9] biomaRt_2.22.0                         
## [10] mzID_1.4.0                             
## [11] Biostrings_2.34.0                      
## [12] XVector_0.6.0                          
## [13] Pbase_0.4.0                            
## [14] Gviz_1.10.0                            
## [15] GenomicRanges_1.18.0                   
## [16] GenomeInfoDb_1.2.0                     
## [17] IRanges_2.0.0                          
## [18] S4Vectors_0.4.0                        
## [19] Rcpp_0.11.3                            
## [20] BiocGenerics_0.12.0                    
## [21] BiocStyle_1.4.0                        
## 
## loaded via a namespace (and not attached):
##  [1] BBmisc_1.7               BatchJobs_1.4           
##  [3] BiocInstaller_1.16.0     BiocParallel_1.0.0      
##  [5] DBI_0.3.1                Formula_1.1-2           
##  [7] GenomicAlignments_1.2.0  Hmisc_3.14-5            
##  [9] MALDIquant_1.11          MASS_7.3-35             
## [11] MSnbase_1.14.0           Pviz_1.0.0              
## [13] R.methodsS3_1.6.1        RColorBrewer_1.0-5      
## [15] RCurl_1.95-4.3           RSQLite_0.11.4          
## [17] Rsamtools_1.18.0         VariantAnnotation_1.12.0
## [19] XML_3.98-1.1             acepack_1.3-3.3         
## [21] affy_1.44.0              affyio_1.34.0           
## [23] base64enc_0.1-2          biovizBase_1.14.0       
## [25] bitops_1.0-6             brew_1.0-6              
## [27] checkmate_1.4            chron_2.3-45            
## [29] cleaver_1.4.0            cluster_1.15.3          
## [31] codetools_0.2-9          colorspace_1.2-4        
## [33] data.table_1.9.4         dichromat_2.0-0         
## [35] digest_0.6.4             doParallel_1.0.8        
## [37] evaluate_0.5.5           fail_1.2                
## [39] foreach_1.4.2            foreign_0.8-61          
## [41] formatR_1.0              gtable_0.1.2            
## [43] impute_1.40.0            iterators_1.0.7         
## [45] knitr_1.7                labeling_0.3            
## [47] lattice_0.20-29          latticeExtra_0.6-26     
## [49] limma_3.22.0             markdown_0.7.4          
## [51] matrixStats_0.10.0       mime_0.2                
## [53] munsell_0.4.2            mzR_2.0.0               
## [55] nnet_7.3-8               pcaMethods_1.56.0       
## [57] plyr_1.8.1               preprocessCore_1.28.0   
## [59] proto_0.3-10             reshape2_1.4            
## [61] rpart_4.1-8              scales_0.2.4            
## [63] sendmailR_1.2-1          splines_3.1.1           
## [65] stringr_0.6.2            survival_2.37-7         
## [67] tools_3.1.1              vsn_3.34.0              
## [69] zlibbioc_1.12.0