1 Why bams?

seqsetvis can load profiles from bigWigs or construct profiles of read pileups from bams, which are binary sam files. Binary files are not human readable but are compressed and provide rapid access for computers,

Compared to bam files, bigWigs generally take up much less disk space and are faster to load data from. They are also standard outputs used by many powerful NGS analysis tools (or can be easily generated from bedGraph files) and are accepted as inputs by common genome browsers (UCSC, WashU, IGV etc.).

However, assessing read pileups from bam files directly allows for more flexibilty and is extremely powerful for quality control in ChIP-seq and related data. This vignette will demonstrate methods for loading data from bam files and how the results are useful in assessing ChIP-seq data quality.

library(seqsetvis)
library(data.table)
library(GenomicRanges)

2 The simplest case

A straightforward pileup.

bam_dt = ssvFetchBam(bam_file, query_gr, return_data.table = TRUE)
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam ...
## Warning in load_signal(f, nam, qgr): creating index for /tmp/RtmpKP1IF2/
## Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam
## fragLen was calculated as: 172
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam.
bam_dt[, c("peak_status", "peak_id") := tstrsplit(id, "_", keep = c(4,5))]
p_basic = p_default + 
    geom_path(data = bam_dt, aes(x = x, y = y, color = strand))
p_basic
Read pileup with default parameters

Figure 1: Read pileup with default parameters

bam_dt = ssvFetchBam(bam_file, 
                     query_gr, 
                     fragLens = NA, 
                     win_size = 5, 
                     return_data.table = TRUE)
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam ...
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam.
bam_dt[, c("peak_status", "peak_id") := tstrsplit(id, "_", keep = c(4,5))]
p_basic = p_default + 
    geom_path(data = bam_dt, aes(x = x, y = y, color = strand))
p_basic
Read pileup with default parameters

Figure 2: Read pileup with default parameters

bam_dt = ssvFetchBam(bam_file,
                     query_gr,
                     fragLens = NA, 
                     win_size = 5, 
                     return_data.table = TRUE,
                     target_strand = "both")
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam ...
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam.
bam_dt[, c("peak_status", "peak_id") := tstrsplit(id, "_", keep = c(4,5))]
p_basic + geom_path(data = bam_dt, aes(x = x, y = y, color = strand)) + facet_grid("peak_id~peak_status")
Strand sensitive read pileup

(#fig:basic_strand)Strand sensitive read pileup

3 Considering strands and disabling extension to fragment length.

bam_dt = ssvFetchBam(bam_file,
                     win_size = 5,
                     fragLens = NA,
                     query_gr,
                     return_data.table = TRUE,
                     target_strand = "both")
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam ...
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam.
bam_dt[, c("peak_status", "peak_id") := tstrsplit(id, "_", keep = c(4,5))]
p_basic + geom_path(data = bam_dt, aes(x = x, y = y, color = strand))
no extension to fragment length

(#fig:basic_NAfragLens)no extension to fragment length

bam_dt = ssvFetchBam(bam_file,
                     win_size = 10,
                     fragLens = "auto",
                     query_gr,
                     return_data.table = TRUE,
                     max_dupes = 1,
                     target_strand = "both")
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam ...
## fragLen was calculated as: 172
## fragLen was calculated as: 172
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam.
bam_dt[, c("peak_status", "peak_id") := tstrsplit(id, "_", keep = c(4,5))]
p_basic + geom_path(data = bam_dt, aes(x = x, y = y, color = strand)) 

shift_dt = crossCorrByRle(bam_file, query_gr)
shift_dt[, c("peak_status", "peak_id") := tstrsplit(id, "_", keep = c(4,5))]
ggplot(shift_dt, aes(x = shift, y = correlation)) + 
    geom_path() + 
    facet_grid("peak_id~peak_status") +
    annotate("line", x = rep(172, 2), y = range(shift_dt$correlation), color = "red")
Calculated fragment length doesn't match peak strand cross correlation very well.

Figure 3: Calculated fragment length doesn’t match peak strand cross correlation very well

shift_dt[, .(max_shift = shift[which.max(correlation)]), .(peak_status, peak_id)]
##    peak_status peak_id max_shift
## 1:        peak       4       137
## 2:        peak       7       116
## 3:        peak       2       141
## 4:        peak       3       155
## 5:    negative       4       262
## 6:    negative       7        50
## 7:    negative       2        78
## 8:    negative       3       300
fl = fragLen_calcStranded(bam_file, subset(query_gr, grepl("peak", name)))
fl
## [1] 138
bam_dt = ssvFetchBam(bam_file,
                     win_size = 10,
                     fragLens = 141,
                     query_gr,
                     return_data.table = TRUE,
                     max_dupes = 1,
                     target_strand = "both")
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam ...
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/test_peaks.bam.
bam_dt[, c("peak_status", "peak_id") := tstrsplit(id, "_", keep = c(4,5))]
p_basic + geom_path(data = bam_dt, aes(x = x, y = y, color = strand))

4 PE

Paired-end data is a bit different.

data("Bcell_peaks")

pe_default = ssvFetchBamPE(pe_file, Bcell_peaks)
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/Bcell_PE.mm10.bam ...
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/Bcell_PE.mm10.bam.
pe_raw = ssvFetchBamPE(pe_file, Bcell_peaks, return_unprocessed = TRUE)
## loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/Bcell_PE.mm10.bam ...
## finished loading /tmp/RtmpKP1IF2/Rinst1dd91bd83f6f/seqsetvis/extdata/Bcell_PE.mm10.bam.
pe_raw[isize > 0, mean(isize), .(id)]
##               id       V1
## 1: H31_peak_4369 157.9487
## 2: H31_peak_5756 163.8020
## 3: H31_peak_7132 156.3113
## 4: H31_peak_9036 382.1304
ggplot(pe_raw[isize > 0], aes(x = isize)) + geom_histogram() + facet_wrap("id")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.



shift_dt = crossCorrByRle(pe_file, Bcell_peaks, flip_strand = TRUE)
shift_dt[, shift[which.max(correlation)], .(id)]
##               id  V1
## 1: H31_peak_5756 172
## 2: H31_peak_9036  50
## 3: H31_peak_7132 149
## 4: H31_peak_4369 155
ggplot(shift_dt, aes(x = shift, y = correlation)) + 
    geom_path() + 
    facet_wrap("id") +
    annotate("line", x = rep(160, 2), y = range(shift_dt$correlation), color = "red")