## ----dontrun-basics-vignette, eval=FALSE--------------------------------- # vignette("phyloseq-basics") ## ----load-packages, message=FALSE, warning=FALSE------------------------- library("phyloseq") library("ggplot2") ## ----ggplot2-themes------------------------------------------------------ theme_set(theme_bw()) pal = "Set1" scale_colour_discrete <- function(palname = pal, ...) { scale_colour_brewer(palette = palname, ...) } scale_fill_discrete <- function(palname = pal, ...) { scale_fill_brewer(palette = palname, ...) } ## ------------------------------------------------------------------------ data(GlobalPatterns) ## ------------------------------------------------------------------------ # prune OTUs that are not present in at least one sample GP <- prune_taxa(taxa_sums(GlobalPatterns) > 0, GlobalPatterns) # Define a human-associated versus non-human categorical variable: human <- get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue") # Add new human variable to sample data: sample_data(GP)$human <- factor(human) ## ----richness_estimates0, fig.width=13, fig.height=7--------------------- alpha_meas = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson") (p <- plot_richness(GP, "human", "SampleType", measures=alpha_meas)) ## ----richness_estimates, fig.width=13,height=7--------------------------- p + geom_boxplot(data=p$data, aes(x=human, y=value, color=NULL), alpha=0.1) ## ------------------------------------------------------------------------ GP.chl <- subset_taxa(GP, Phylum=="Chlamydiae") ## ----GP-chl-tree, fig.width=15, fig.height=7, message=FALSE, warning=FALSE---- plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="Abundance") ## ------------------------------------------------------------------------ data(enterotype) ## ----EntAbundPlot, fig.height=6, fig.width=8----------------------------- par(mar = c(10, 4, 4, 2) + 0.1) # make more room on bottom margin N <- 30 barplot(sort(taxa_sums(enterotype), TRUE)[1:N]/nsamples(enterotype), las=2) ## ------------------------------------------------------------------------ rank_names(enterotype) ## ------------------------------------------------------------------------ TopNOTUs <- names(sort(taxa_sums(enterotype), TRUE)[1:10]) ent10 <- prune_taxa(TopNOTUs, enterotype) print(ent10) ## ------------------------------------------------------------------------ sample_variables(ent10) ## ----entbarplot0, fig.height=6, fig.width=10----------------------------- plot_bar(ent10, "SeqTech", fill="Enterotype", facet_grid=~Genus) ## ----GPheatmap----------------------------------------------------------- data("GlobalPatterns") gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota") (p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family")) ## ----plot_sample_network, fig.width=11, fig.height=7, message=FALSE, warning=FALSE---- data(enterotype) ig <- make_network(enterotype, max.dist=0.3) plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.4, label=NULL) ## ----eval=FALSE---------------------------------------------------------- # my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom") # my.ord <- ordinate(my.physeq) # plot_ordination(my.physeq, my.ord, color="myFavoriteVarible") ## ----help-import, eval=FALSE--------------------------------------------- # help(import) # help(ordinate) # help(distance) # help(plot_ordination) ## ----GP-data-load-------------------------------------------------------- data(GlobalPatterns) ## ----, eval=FALSE-------------------------------------------------------- # GPUF <- UniFrac(GlobalPatterns) ## ----load-precomputed-UF------------------------------------------------- load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq")) ## ------------------------------------------------------------------------ GloPa.pcoa = ordinate(GlobalPatterns, method="PCoA", distance=GPUF) ## ----PCoAScree, fig.width=6, fig.height=4-------------------------------- plot_scree(GloPa.pcoa, "Scree plot for Global Patterns, UniFrac/PCoA") ## ----GPfig5ax1213-------------------------------------------------------- (p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") + geom_point(size=5) + geom_path() + scale_colour_hue(guide = FALSE) ) (p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3), color="SampleType") + geom_line() + geom_point(size=5) ) ## ----GP_UF_NMDS0--------------------------------------------------------- # (Re)load UniFrac distance matrix and GlobalPatterns data data(GlobalPatterns) load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq")) # perform NMDS, set to 2 axes GP.NMDS <- ordinate(GlobalPatterns, "NMDS", GPUF) (p <- plot_ordination(GlobalPatterns, GP.NMDS, "samples", color="SampleType") + geom_line() + geom_point(size=5) ) ## ----GPCAscree0, fig=FALSE----------------------------------------------- data(GlobalPatterns) # Take a subset of the GP dataset, top 200 species topsp <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:200]) GP <- prune_taxa(topsp, GlobalPatterns) # Subset further to top 5 phyla, among the top 200 OTUs. top5ph <- sort(tapply(taxa_sums(GP), tax_table(GP)[, "Phylum"], sum), decreasing=TRUE)[1:5] GP <- subset_taxa(GP, Phylum %in% names(top5ph)) # Re-add human variable to sample data: sample_data(GP)$human <- factor(human) ## ----GPCAscree, fig.width=8, fig.height=5-------------------------------- # Now perform a unconstrained correspondence analysis gpca <- ordinate(GP, "CCA") # Scree plot plot_scree(gpca, "Scree Plot for Global Patterns Correspondence Analysis") ## ----GPCA1234------------------------------------------------------------ (p12 <- plot_ordination(GP, gpca, "samples", color="SampleType") + geom_line() + geom_point(size=5) ) (p34 <- plot_ordination(GP, gpca, "samples", axes=c(3, 4), color="SampleType") + geom_line() + geom_point(size=5) ) ## ----GPCAspecplot0------------------------------------------------------- p1 <- plot_ordination(GP, gpca, "species", color="Phylum") (p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) + facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) ) ## ----GPCAspecplotTopo0--------------------------------------------------- (p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() + facet_wrap(~Phylum) + scale_colour_hue(guide = FALSE) ) ## ----GPCAjitter0--------------------------------------------------------- library("reshape2") # Melt the species-data.frame, DF, to facet each CA axis separately mdf <- melt(p1$data[, c("CA1", "CA2", "Phylum", "Family", "Genus")], id=c("Phylum", "Family", "Genus") ) # Select some special outliers for labelling LF <- subset(mdf, variable=="CA2" & value < -1.0) # build plot: boxplot summaries of each CA-axis, with labels p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) + geom_boxplot() + facet_wrap(~variable, 2) + scale_colour_hue(guide = FALSE) + theme_bw() + theme( axis.text.x = element_text(angle = -90, hjust = 0) ) # Add the text label layer, and render ggplot graphic (p <- p + geom_text(aes(Phylum, value+0.1, color=Phylum, label=Family), data=LF, vjust=0, size=2) ) ## ----GPtaxaplot0--------------------------------------------------------- plot_bar(GP, x="human", fill="SampleType", facet_grid= ~ Phylum) ## ----GPdpcoa01----------------------------------------------------------- GP.dpcoa <- ordinate(GP, "DPCoA") pdpcoa <- plot_ordination(GP, GP.dpcoa, type="biplot", color="SampleType", shape="Phylum") shape.fac <- pdpcoa$data[, deparse(pdpcoa$mapping$shape)] man.shapes <- c(19, 21:25) names(man.shapes) <- c("Samples", levels(shape.fac)[levels(shape.fac)!="Samples"]) p2dpcoa <- pdpcoa + scale_shape_manual(values=man.shapes) ## ----GPdpcoa02----------------------------------------------------------- # Show just Samples or just Taxa plot_ordination(GP, GP.dpcoa, type="taxa", shape="Phylum") plot_ordination(GP, GP.dpcoa, type="samples", color="SampleType") # Split plot_ordination(GP, GP.dpcoa, type="split", color="SampleType", shape="Phylum") ## ----distancefun--------------------------------------------------------- data(esophagus) distance(esophagus) # Unweighted UniFrac distance(esophagus, weighted=TRUE) # weighted UniFrac distance(esophagus, "jaccard") # vegdist jaccard distance(esophagus, "g") # betadiver method option "g" distance("help") ## ----eval=FALSE, echo=TRUE----------------------------------------------- # data(esophagus) # UniFrac(esophagus, weighted=TRUE) # # distance(esophagus, weighted=TRUE) # Alternative using the distance() function # UniFrac(esophagus, weighted=FALSE) # # distance(esophagus) # Alternative using the distance() function ## ----echo=FALSE---------------------------------------------------------- round( UniFrac(esophagus, weighted=TRUE), 3) round( UniFrac(esophagus, weighted=FALSE), 3) ## ------------------------------------------------------------------------ # (Re)load UniFrac distance matrix and GlobalPatterns data data(GlobalPatterns) load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq")) # Manually define color-shading vector based on sample type. colorScale <- rainbow(length(levels(get_variable(GlobalPatterns, "SampleType")))) cols <- colorScale[get_variable(GlobalPatterns, "SampleType")] GP.tip.labels <- as(get_variable(GlobalPatterns, "SampleType"), "character") # This is the actual hierarchical clustering call, specifying average-link clustering GP.hclust <- hclust(GPUF, method="average") plot(GP.hclust, col=cols) ## ----eval=FALSE---------------------------------------------------------- # data(enterotype) # # Filter samples that don't have Enterotype classification. # x <- subset_samples(enterotype, !is.na(Enterotype)) # # Calculate the multiple-inference-adjusted P-values # ent.p.table <- mt(x, "Enterotype", test="f", B=1000) # print(head(ent.p.table, 10))