The data analyzed here were generated as part of the larger study reported in “A spatial gradient of bacterial diversity in the human oral cavity shaped by salivary flow”, Proctor et al., Nature communications, 2018: https://doi.org/10.1038/s41467-018-02900-1
Prepare workspace
#load packages
cran_packages <- c("reshape2", "ggplot2", "vegan", "stringr", "gridExtra", "ape", "RColorBrewer", "dplyr", "knitr","cowplot","openxlsx", "circlize", "plotly")
bioc_packages <- c("phyloseq","decontam","ComplexHeatmap")
sapply(c(cran_packages, bioc_packages), require, character.only = TRUE)
## reshape2 ggplot2 vegan stringr gridExtra
## TRUE TRUE TRUE TRUE TRUE
## ape RColorBrewer dplyr knitr cowplot
## TRUE TRUE TRUE TRUE TRUE
## openxlsx circlize plotly phyloseq decontam
## TRUE TRUE TRUE TRUE TRUE
## ComplexHeatmap
## TRUE
#settings
theme_set(theme_bw())
options(stringsAsFactors = FALSE)
path.in <- "OralContamination/" # CHANGE ME
path.out <- "Figures/" # CHANGE ME
#color palettes
cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#000000")
methodPalette <- c(Frequency="darkorange2", Prevalence="turquoise3", Combined="darkorchid4")
prevalencePalette <- c("2" = "#edf8e9", "3-5" = "#bae4b3", "6-10" = "#74c476", "11+" = "#238b45")
#load phyloseq object
load(file.path(path.in, "hypo_sa1_0001_DADA2_ps_new.Rdata"))
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2420 taxa and 767 samples ]
## sample_data() Sample Data: [ 767 samples by 42 sample variables ]
## tax_table() Taxonomy Table: [ 2420 taxa by 6 taxonomic ranks ]
Subset Samples
There may be differences between DNA extracted in PowerSoil plates vs. tubes. For these analyses, subset to just include plate samples, and remove any barcode replicates (“R04” samples).
#remove outlier sample that was added to the DNA pool twice
MUC <- subset_samples(ps, X.SampleID != "P1104C08703R00")
#for contamination analysis, keep only samples that were extracted in 96-well plates
MUC <- subset_samples(MUC, Extraction.Protocol == "plate")
#the 'Replicate' column indicates whether the sample is a 'normal' sample (R00) or whether the sample is a type of derivative sample, e.g. 'R04' indicates the sample's DNA was PCR amplified a second time with a different primer barcode
MUC <- subset_samples(MUC, Replicate == "R00")
#remove singletons (there shouldn't be any from DADA2; only singletons might be strays after subsetting to plate samples)
MUC <- prune_taxa(taxa_sums(MUC) > 1, MUC)
MUC <- prune_samples(sample_sums(MUC) > 0, MUC)
#compare unfiltered data to plate-extracted, non-barcode replicate samples that lack singletons.
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2420 taxa and 767 samples ]
## sample_data() Sample Data: [ 767 samples by 42 sample variables ]
## tax_table() Taxonomy Table: [ 2420 taxa by 6 taxonomic ranks ]
MUC
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1951 taxa and 569 samples ]
## sample_data() Sample Data: [ 569 samples by 42 sample variables ]
## tax_table() Taxonomy Table: [ 1951 taxa by 6 taxonomic ranks ]
MUCprev <- transform_sample_counts(MUC, function(x) ifelse(x>0, 1, 0))
MUCrelabund <- transform_sample_counts(MUC, function(x) x/sum(x))
Extract OTU table and sample metadata as base R data.frames
#get sample sums: each sample's sequencing depth
sample_data(MUC)$sample_sums <- sample_sums(MUC)
MAP <- data.frame(sample_data(MUC))
OTU <- data.frame(as(otu_table(MUC), 'matrix'))
# TAX <- data.frame(tax_table(MUC)) # These older assignments will be replaced by Silva v128 assignments
Load annotated taxonomy table
This includes the DNA sequences, HOMD taxonomic assignments, and by-hand resolutions of discrepancies between Silva and HOMD.
TAXannotate <- read.csv(file.path(path.in, "TAXannotate.csv"))
rownames(TAXannotate) <- TAXannotate$Id
TAXannotate <- TAXannotate[taxa_names(MUC),] # subset down to the taxa being analyzed
Assign taxonomy using SILVA v128
The stored Silva assignments used an older version of the database, so we rerun taxonomic assignment using Silva v128 and dada2’s assignTaxonomy
function.
# Function to simplify Silva assignments for consistency with genus reporting in literature
parse_silva_genus <- Vectorize(function(gen) {
gen <- gsub("\\[", "", gen)
gen <- gsub("\\]", "", gen)
gen <- gsub("Escherichia/Shigella", "Escherichia", gen)
gen <- gsub("Burkholderia-Paraburkholderia", "Burkholderia", gen)
gen <- gsub("Burkholderia-Caballeronia-Paraburkholderia", "Burkholderia", gen)
gen <- gsub("_.*$", "", gen)
if(is.na(gen)) gen <- "unknown"
gen
})
# Uncomment the following to run w/o the taxtabSilva128.rds file
# set.seed(100)
# library(dada2); packageVersion("dada2")
# t128 <- assignTaxonomy(TAXannotate$Sequence, "~/tax/silva_nr_v128_train_set.fa.gz", multithread=TRUE)
# t128[,"Genus"] <- parse_silva_genus(t128[,"Genus"])
# saveRDS(t128, file.path(path.in, "t128.rds"))
t128 <- readRDS(file.path(path.in, "taxtabSilva128.rds"))
TAXannotate[,colnames(t128)] <- t128
TAXannotate[,"GenusSILVA"] <- t128[,"Genus"]
Augment the annotated taxonomy table.
This includes includes assigning the recommended Genus column, as decided by hand between Silva and HOMD assignments:
# 'GenusREC' column will be the recommended taxonomy, based on Taxonomy_recommendation notes.
# assign SILVA to anything that's 'both'
# assign 'neither' to anything that's in one sample or has no recommendation from SILVA or HOMD
unique(TAXannotate$Taxonomy_recommendation)
## [1] "GenusSILVA" "neither at genus level"
## [3] "GenusHOMD" "only in one sample"
## [5] "both"
# Values are: "only in one sample", "GenusSILVA", "GenusHOMD", "both", "neither at genus level"
TAXannotate$GenusREC <- "neither" # Default is unassigned
use.silva <- TAXannotate$Taxonomy_recommendation %in% c("GenusSILVA", "both")
TAXannotate$GenusREC[use.silva] <- TAXannotate$GenusSILVA[use.silva] # Assign Silva genus
use.homd <- TAXannotate$Taxonomy_recommendation %in% c("GenusHOMD")
TAXannotate$GenusREC[use.homd] <- TAXannotate$GenusHOMD[use.homd] # Assign HOMD genus
# Augment with abundance and prevalence information
TAXannotate$TAXsums <- taxa_sums(MUC)[TAXannotate$Id] # Add total abundances
TAXannotate$TAXprev <- taxa_sums(MUCprev)[TAXannotate$Id] # Add prevalences
Filter Taxa
Apply prevalence filter to filter out taxa present in fewer than 2 samples. This is a very lenient filter.
MUC2 <- filter_taxa(MUC, function(x) sum(x > 0) > 1, TRUE)
OTU2 <- OTU[,taxa_names(MUC2)]
TAX2 <- TAXannotate[taxa_names(MUC2),]
MAP2 <- MAP[sample_names(MUC2),]
Plot Frequency Patterns
Create a plot showing examples of the two types of frequency patterns observed in the data: frequency independent of DNA concentration and inversely proportional to it (Figure 2a)
#plot_abundance function for figures (similar to the plot_frequency function in package)
plot_abundance <- function(ps, taxa_to_plot, conc, taxa_are_rows=TRUE, norm=TRUE, log=TRUE, returndf=FALSE){
ot <- as(otu_table(ps), "matrix")
if(taxa_are_rows){
ot <- t(ot)
}
taxa_mismatch <- !(taxa_names(ps) %in% colnames(ot))
if(sum(taxa_mismatch) > 0){
stop("Error: 'taxa_are_rows' argument may not be correct")
}
if(norm){
ot <- sweep(ot, 1, rowSums(ot), "/")
}
ot <- as(ot[,colnames(ot) %in% taxa_to_plot], "matrix")
if(dim(ot)[2] == 1){
colnames(ot) <- taxa_to_plot
}
st <- as(sample_data(ps), "data.frame")
snames <- sample_names(ps)
plot <- merge(st, ot, by.x = "row.names", by.y = "row.names", sort=FALSE)
plot_melt <- melt(plot, id.vars = colnames(plot)[1:(dim(plot)[2]-length(taxa_to_plot))])
colnames(plot_melt)[dim(plot_melt)[2]-1] <- "taxa_to_plot"
colnames(plot_melt)[dim(plot_melt)[2]] <- "taxon_abundance"
taxon_levels <- taxa_to_plot
plot_melt$taxa_to_plot <- factor(plot_melt$taxa_to_plot, levels = taxon_levels)
I <- which(colnames(plot_melt) == conc)
colnames(plot_melt)[I] <- "DNA_conc"
if(returndf == FALSE){
if(log==TRUE){
p1 <- ggplot(plot_melt, aes(log(DNA_conc),log(taxon_abundance)))
return(p1 + geom_point())
} else if(log==FALSE){
p1 <- ggplot(plot_melt, aes(DNA_conc,taxon_abundance))
return(p1 + geom_point())
}
} else if(returndf == TRUE){
return(plot_melt)
}
}
#Figure 2a
f2a <- plot_abundance(MUC,c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200'),
"quant_reading",taxa_are_rows=FALSE)
f2a <- f2a + facet_wrap(~taxa_to_plot,nrow=1) +
labs(x= 'log(DNA concentration)', y = 'log(Frequency)') +
# geom_point(aes(color=Sample_or_Control)) +
geom_point() +
# theme(axis.text.x = element_text(colour="grey20",size=15,angle=0,
# hjust=.5,vjust=.5,face="plain"),
# axis.text.y = element_text(colour="grey20",size=10,angle=0,
# hjust=1,vjust=0,face="plain"),
# axis.title.x = element_text(colour="grey20",size=15,angle=0,
# hjust=.5,vjust=0,face="plain"),
# axis.title.y = element_text(colour="grey20",size=15,angle=90,
# hjust=.5,vjust=.5,face="plain"),
# strip.text.x = element_text(size = 15, angle = 0),
# legend.title = element_text(size = 15, angle = 0),
# legend.text = element_text(size = 15, angle = 0)) +
theme(legend.position = 'top')
f2a
Classify contaminants
Using both the frequency and prevalence methods, and also the combined method that classifies based on a score that is a composite of the frequency and pevalence scores.
conc2 <- MAP2$quant_reading # PicoGreen fluroescent intensity
neg2 <- MAP2$Sample_or_Control == 'Control Sample' # True if a negative control sample
# Frequency-based contaminant classification
ocf <- isContaminant(as.matrix(OTU2), conc=conc2, threshold=0.1, detailed=TRUE, normalize=TRUE, method='frequency')
# Prevalence-based contaminant classification
ocp <- isContaminant(as.matrix(OTU2), neg=neg2, threshold=0.1, detailed=TRUE, normalize=TRUE, method='prevalence')
# Combined contaminant classification
occ <- isContaminant(as.matrix(OTU2), conc=conc2, neg=neg2, threshold=0.1, detailed=TRUE, normalize=TRUE, method='combined')
Combine scores from each method into a data.frame, and add to TAX2.
probcols <- data.frame(row.names=rownames(ocf),prob.f=ocf$p.freq, prob.p=ocp$p.prev, prob.c = occ$p)
TAX2 <- cbind(TAX2[,colnames(TAXannotate)], probcols[TAX2$Id,])
# Crude comparison of frequency and prevalence contaminant assignment
table(probcols$prob.f<0.1, probcols$prob.c<0.1)
##
## FALSE TRUE
## FALSE 711 12
## TRUE 8 44
Solid agreement.
Reference classification
Reference databases were generated for each genus from the HOMD database, and by literatures search for cultivated oral taxa and reported contaminant genera. Load the reference database files & organize genera from the reference databases.
contam_database <- read.csv(file.path(path.in, "contamination_database.csv"))
oral_database <- read.csv(file.path(path.in, "oral_database.csv"))
HOMD <- read.csv(file.path(path.in, "homd_taxonomy_table.csv"))
### Oral genera
#genera that have been visualized in the mouth.
oral_visualized <- unique(oral_database$Genus)
#genera that have been named & cultivated [BJC: removed not visualized condition]
oral_cultivated <- unique(HOMD$Genus[HOMD$Status %in% c('Named','Unnamed')])
#oral_cultivated <- oral_cultivated[!(oral_cultivated %in% oral_visualized)]
# All genera detected in the mouth
oral_all <- unique(c(oral_visualized, oral_cultivated))
### Contaminant genera
#genera that have been identified as contaminants in previous studies in the literature.
contaminatinggenera <- contam_database %>%
group_by(Genus) %>%
mutate(numberstudies = length(unique(Reference)))
## Warning: package 'bindrcpp' was built under R version 3.4.4
contam_df <- as.data.frame(contaminatinggenera)
#genera that have been identified as contaminants in more than one previous study
contam_multiple <- unique(contaminatinggenera$Genus[contaminatinggenera$numberstudies > 1])
contam_multiple <- contam_multiple[contam_multiple != "unspecified"] #68 of these
#genera that have been identified as contaminants in at least one previous study
contam_one <- unique(contaminatinggenera$Genus[contaminatinggenera$Genus != 'unspecified']) #213 of these
###BJC: Lists of genera w/ short names
oral_cult <- unique(HOMD$Genus[HOMD$Status %in% c('Named','Unnamed')])
oral_vis <- unique(oral_database$Genus)
oral <- unique(c(oral_cult, oral_vis))
contam <- unique(contam_df$Genus)
contam2 <- unique(contam_df$Genus[contam_df$numberstudies>1])
Categorize taxa according to reference databases
tt <- TAX2 # Working version of TAX2
tt$Genus <- tt$GenusREC
### Categorize
# Oral, if in oral db and not in contam db.
# Contaminant, if reported at least twice as a contaminant, and not in oral db.
# Ambiguous otherwise.
tt$Group <- "Ambiguous" # Default
is.oral <- tt$Genus %in% oral & !tt$Genus %in% contam
tt$Group[is.oral] <- "Oral"
is.contam <- tt$Genus %in% contam & !tt$Genus %in% oral
tt$Group[is.contam] <- "Contaminant"
# Augment with prevalence groups and Abundance column
tt$Prevalence <- cut(tt$TAXprev, c(0, 2, 5, 10, 9999), labels=c("2", "3-5", "6-10", "11+"))
tt$Abundance <- tt$TAXsums
Inspect categories.
table(tt$Group)
##
## Ambiguous Contaminant Oral
## 696 54 97
tapply(tt$Abundance, tt$Group, sum)/sum(sum(tt$TAXsums))
## Ambiguous Contaminant Oral
## 0.937759276 0.006586119 0.055654605
top.genera <- head(names(sort(tapply(tt$Abundance, tt$Genus, sum), decreasing=TRUE)))
in.both <- top.genera %in% oral & top.genera %in% contam
names(in.both) <- top.genera
in.both
## Streptococcus Haemophilus Neisseria Prevotella Gemella
## TRUE TRUE TRUE TRUE TRUE
## Rothia
## TRUE
A minority of taxa and reads are being unambiguously assigned, but this is the most useful fraction for assessing classification accuracy, and it is still a non-trival number (~150/850 taxa, ~6% of total reads). Fewer reads are being assigned because the abundant genera (e.g. Steptococcus) are expansive, and unsurprisingly show up in both the oral and contaminant databases.
Plot score distribution on the oral dataset.
This is Figure 2b, with full histogram for prevalence, frequency and combined methods.
tt.class <- tt[!is.na(tt$prob.c),] # 775 classified by decontam, aka "All ASVs"
tt.class.contam <- tt.class[tt.class$Group == "Contaminant",]
tt.class.oral <- tt.class[tt.class$Group == "Oral",]
TAXann <- rbind(cbind(tt.class, Score=tt.class$prob.f, Method="Frequency"),
cbind(tt.class, Score=tt.class$prob.p, Method="Prevalence"),
cbind(tt.class, Score=tt.class$prob.c, Method="Combined"))
TAXann$Method <- factor(TAXann$Method, levels=c('Frequency','Prevalence','Combined'))
#figure 2b
histo <- ggplot(TAXann, aes(x=Score, fill=Prevalence))
histo <- histo + geom_histogram() + labs(x = 'decontam Score', y='Number ASVs') +
facet_wrap(~Method, nrow=1) +
#http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=4
scale_fill_manual(values=prevalencePalette) +
# theme(axis.text.x = element_text(colour="grey20",size=10,angle=0,
# hjust=.5,vjust=.5,face="plain"),
# axis.text.y = element_text(colour="grey20",size=10,angle=0,
# hjust=1,vjust=0,face="plain"),
# axis.title.x = element_text(colour="grey20",size=15,angle=0,
# hjust=.5,vjust=0,face="plain"),
# axis.title.y = element_text(colour="grey20",size=15,angle=90,
# hjust=.5,vjust=.5,face="plain"),
# strip.text.x = element_text(size = 15, angle = 0),
# legend.title = element_text(size = 15, angle = 0),
# legend.text = element_text(size = 10, angle = 0)) +
theme(legend.position = "bottom")
histo
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
###ggsave(file.path(path.out, "Oral_Score_Histogram.pdf"), histo, width=7, height=2.6, units="in", useDingbats=FALSE)
###ggsave(file.path(path.out, "Oral_Score_Histogram.png"), histo, width=7, height=2.6, units="in")
Plot score distributions for difference reference categories
Supplementary figure showing score histograms for genera that are ambiguous by reference classification.
histo.class <- ggplot(data=TAXann, aes(x=Score, fill=Prevalence)) +
geom_histogram() + labs(x = 'decontam Score', y='Number ASVs') +
facet_grid(Group~Method, scales="free_y") +
scale_fill_manual(values=prevalencePalette) +
theme(legend.position = "bottom")
histo.class
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave(file.path(path.out, "Oral_Score_Histogram_byReferenceClassification.pdf"), histo.class, width=7, height=4.8, units="in", useDingbats=FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave(file.path(path.out, "Oral_Score_Histogram_byReferenceClassification.png"), histo.class, width=7, height=4.8, units="in")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plot score distribution on the oral dataset, weighted by abundance.
This is the Supplementary Figure with full histogram weighted by abundance for the prevalence, frequency and combined methods. This is the abundance-weighted version of Figure 2.
histo.wt <- histo + aes(weight=Abundance)
histo.wt
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
histo.wt.log <- histo.wt + scale_y_log10(name="Reads") +
aes(fill=Method) + scale_fill_manual(values=methodPalette)
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
# Not a correct histogram if fill specified, as logged values are stacked inappropriately
histo.wt.log
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 7 rows containing missing values (geom_bar).
###ggsave(file.path(path.out, "Oral_Score_Histogram_Weighted.pdf"), histo.wt, width=7, height=2.6, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Score_Histogram_Weighted_logY.pdf"), histo.wt.log, width=7, height=2.6, units="in", useDingbats=FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 7 rows containing missing values (geom_bar).
ggsave(file.path(path.out, "Oral_Score_Histogram_Weighted_logY.png"), histo.wt.log, width=7, height=2.6, units="in")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 7 rows containing missing values (geom_bar).
Pub quality figure (w/ Ambiguous in background):
scat <- ggplot(data=tt[tt$Group != "Ambiguous",], aes(x=prob.f, y=prob.p, color=Group, size=Abundance))
scat <- scat + scale_colour_manual(values = c("Oral"="blue", "Contaminant"="red", "Ambiguous"="gray80"), name="Reference\nClassification")
scat <- scat + geom_point(data=tt[tt$Group == "Ambiguous",], alpha=0.3)
scat <- scat + geom_point(alpha=0.3)
scat <- scat + labs(x = 'Score (Frequency)', y='Score (Prevalence)') + guides(size=FALSE)
scat <- scat + facet_grid(~Prevalence)
scat <- scat + guides(colour = guide_legend(override.aes = list(alpha = 1)))
scat <- scat + scale_x_continuous(breaks=c(0,0.25,0.5,0.75,1), labels=c("0", "0.25", "0.5", "0.75", "1"))
scat <- scat + scale_y_continuous(breaks=c(0,0.25,0.5,0.75,1), labels=c("0", "0.25", "0.5", "0.75", "1"))
scat
###ggsave(file.path(path.out, "Oral_Scores_Scatter.pdf"), scat, width=10, height=2.8, units="in", useDingbats=FALSE)
Calculate and plot the ROC for each classification method:
cc <- tt[tt$Group != "Ambiguous",] # Just reference classified taxa
get.roc <- function(df, score, weight=FALSE, thresh=0.1) {
ord <- order(df[,score], na.last=NA)
df <- df[ord,]
if(weight) {
xx <- c(0, cumsum(df$Abundance*(df$Group=="Oral"))/sum(df$Abundance*(df$Group=="Oral")))
yy <- c(0, cumsum(df$Abundance*(df$Group=="Contaminant"))/sum(df$Abundance*(df$Group=="Contaminant")))
} else {
xx <- c(0, cumsum(df$Group=="Oral")/sum(df$Group=="Oral"))
yy <- c(0, cumsum(df$Group=="Contaminant")/sum(df$Group=="Contaminant"))
}
i.thresh <- min(which(df[,score]>thresh))
ii <- seq_along(xx) == i.thresh
return(data.frame(InverseSpecificity=xx, Sensitivity=yy, Threshold=ii))
# InverseSpecificity = 1-Specificity
}
rocdf <- rbind(cbind(get.roc(cc, "prob.f"), Method="Frequency", Measure="ASVs"),
cbind(get.roc(cc, "prob.p"), Method="Prevalence", Measure="ASVs"),
cbind(get.roc(cc, "prob.c"), Method="Combined", Measure="ASVs"))
rocdf$Method <- factor(rocdf$Method, levels=c("Frequency", "Prevalence", "Combined"))
arocdf <- rbind(cbind(get.roc(cc, "prob.f", weight=TRUE), Method="Frequency", Measure="Reads"),
cbind(get.roc(cc, "prob.p", weight=TRUE), Method="Prevalence", Measure="Reads"),
cbind(get.roc(cc, "prob.c", weight=TRUE), Method="Combined", Measure="Reads"))
arocdf$Method <- factor(arocdf$Method, levels=c("Frequency", "Prevalence", "Combined"))
proc <- ggplot(data=rbind(rocdf, arocdf), aes(x=InverseSpecificity, y=Sensitivity, color=Method)) +
geom_line() + geom_abline(slope=1,intercept=0,col="black",linetype="dashed") +
facet_wrap(~Measure) +
scale_x_continuous(breaks=c(0, 0.25, 0.5, 0.75, 1), labels=c(1, 0.75, 0.5, 0.25, 0)) +
xlab("Specificity") + # Previous scale converts from 1-Specificity to Specificity
geom_point(aes(size=Threshold)) +
scale_color_manual(values=methodPalette) +
scale_size_manual(values=c("FALSE"=0, "TRUE"=2)) + guides(size=FALSE)
proc
###ggsave(file.path(path.out, "Oral_ROC.pdf"), proc, width=6, height=2.4, units="in", useDingbats=FALSE)
rocdf[rocdf$Threshold,] # InverseSpecificity here is 1-Specificity
## InverseSpecificity Sensitivity Threshold Method Measure
## 19 0.00000000 0.3333333 TRUE Frequency ASVs
## 191 0.05154639 0.6111111 TRUE Prevalence ASVs
## 321 0.01075269 0.4545455 TRUE Combined ASVs
arocdf[arocdf$Threshold,]
## InverseSpecificity Sensitivity Threshold Method Measure
## 19 0.000000e+00 0.9706820 TRUE Frequency Reads
## 191 6.409174e-04 0.7689587 TRUE Prevalence Reads
## 321 7.552549e-05 0.9796471 TRUE Combined Reads
1-arocdf[arocdf$Threshold,"InverseSpecificity"]
## [1] 1.0000000 0.9993591 0.9999245
TWITTER REQUEST: Calculate and plot the precision-recall curve for each classification method:
get.pr <- function(df, score, weight=FALSE, thresh=0.1) {
ord <- order(df[,score], na.last=NA)
df <- df[ord,]
if(weight) {
tp <- cumsum(df$Abundance*(df$Group=="Contaminant"))
fp <- cumsum(df$Abundance*(df$Group=="Oral"))
totp <- sum(df$Abundance * (df$Group=="Contaminant"))
} else {
tp <- cumsum(df$Group=="Contaminant")
fp <- cumsum(df$Group=="Oral")
totp <- sum(df$Group=="Contaminant")
}
i.thresh <- min(which(df[,score]>thresh))
ii <- seq_along(tp) == i.thresh
return(data.frame(Precision=tp/(tp+fp), Recall=tp/totp, Threshold=ii))
}
prdf <- rbind(cbind(get.pr(cc, "prob.f"), Method="Frequency", Measure="ASVs"),
cbind(get.pr(cc, "prob.p"), Method="Prevalence", Measure="ASVs"),
cbind(get.pr(cc, "prob.c"), Method="Combined", Measure="ASVs"))
prdf$Method <- factor(prdf$Method, levels=c("Frequency", "Prevalence", "Combined"))
aprdf <- rbind(cbind(get.pr(cc, "prob.f", weight=TRUE), Method="Frequency", Measure="Reads"),
cbind(get.pr(cc, "prob.p", weight=TRUE), Method="Prevalence", Measure="Reads"),
cbind(get.pr(cc, "prob.c", weight=TRUE), Method="Combined", Measure="Reads"))
aprdf$Method <- factor(aprdf$Method, levels=c("Frequency", "Prevalence", "Combined"))
ppr <- ggplot(data=rbind(aprdf), aes(x=Recall, y=Precision, color=Method)) +
geom_line() +
facet_wrap(~Measure) +
xlab("Recall") + # Previous scale converts from 1-Specificity to Specificity
geom_point(aes(size=Threshold)) +
scale_color_manual(values=methodPalette) +
scale_size_manual(values=c("FALSE"=0, "TRUE"=2)) + guides(size=FALSE)
ppr
###ggsave(file.path(path.out, "Oral_ROC.pdf"), proc, width=6, height=2.4, units="in", useDingbats=FALSE)
prdf[prdf$Threshold,] # InverseSpecificity here is 1-Specificity
## Precision Recall Threshold Method Measure
## 19 0.9473684 0.3333333 TRUE Frequency ASVs
## 190 0.8717949 0.6296296 TRUE Prevalence ASVs
## 319 0.9411765 0.4848485 TRUE Combined ASVs
aprdf[aprdf$Threshold,]
## Precision Recall Threshold Method Measure
## 19 0.9998566 0.9706820 TRUE Frequency Reads
## 190 0.9930102 0.7694226 TRUE Prevalence Reads
## 319 0.9993429 0.9800923 TRUE Combined Reads
Investigate the most glaring seeming misclassifications.
The seeming FN in the 11+ group (the red point in the upper right of that panel):
upright.11 <- tt$prob.f > 0.5 & tt$prob.p > 0.5 & tt$TAXprev > 10; sum(upright.11)
## [1] 327
i.fn.11 <- which(tt$Group %in% "Contaminant" & upright.11)
tt[i.fn.11,"Genus"] # [1] "Peptococcus"
## [1] "Peptococcus"
Peptococcus is a known inhabitant of the human mouth. Just a reference db error.
The seeming FP in the 6-10 group (the blue point in the lower left of that panel, none in 11+):
lowleft.6 <- tt$prob.f < 0.5 & tt$prob.p < 0.5 & tt$TAXprev >= 6; sum(lowleft.6)
## [1] 49
i.fp.6 <- which(tt$Group %in% "Oral" & lowleft.6)
tt[i.fp.6,"Genus"] # [1] "Moraxella" "Enterobacter" "Delftia"
## [1] "Mycobacterium"
Mycobacterium certainly colonizes humans, but it also well known as a laboratory contaminant despite not appearing in our contamination database.
Main Text Figures
Make combined publication version of Figure 2 (Frequency patterns and score distributions on oral dataset):
fig2 <- plot_grid(f2a, histo, nrow=2, rel_heights = c(0.8,1), labels = c('a','b'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig2
ggsave(file.path(path.out, "Oral_Fig2.pdf"), fig2, width=7.5, height=5, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Fig2.png"), fig2, width=7.5, height=5, units="in")
Make combined publication version of Figure 3 (accuracy on the oral dataset):
fig3 <- plot_grid(scat, proc, nrow=2, rel_heights = c(1,1), labels = c('a','b'))
fig3
ggsave(file.path(path.out, "Oral_Fig3.pdf"), fig3, width=7, height=3.6, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Fig3.png"), fig3, width=7, height=3.6, units="in")
Further investigation of the frequency method FP rate
Evaluate the FP rate of the frequency method under permutation of the concentrations to address reviewer concerns about the pseudo-F-test frequency classification statistic
set.seed(100)
fperm <- lapply(seq(100), function(i) {
MAPperm <- MAP2
MAPperm$Sample_or_Control <- MAP2$Sample_or_Control[sample(nrow(MAP2))]
MAPperm$quant_reading <- MAP2$quant_reading[sample(nrow(MAP2))]
ocf.perm <- isContaminant(as.matrix(OTU2), MAPperm$quant_reading, threshold = 0.1,
detailed=TRUE, normalize=TRUE, method='frequency')
ocf.perm
})
# Calculate FP rate at the ASV level
fp.asv <- Reduce("+", lapply(fperm, function(x) table(x$contaminant)))
fp.asv
# FALSE TRUE
# 83970 730
fp.asv/sum(fp.asv)
# FALSE TRUE
# 0.991381346 0.008618654
# Calculate FP rate at the Read level
fp.rd <- sum(sapply(fperm, function(x) sum(x$freq[x$contaminant])))/length(fperm)
fp.rd
# FALSE TRUE
# 0.991381346 0.008618654
# Get summary statistics of the prevalence of FPs
permdf <- do.call(rbind, fperm)
summary(permdf$prev[permdf$contaminant])
A low (sub 1%) false positive rate, almost all of which comes on very low prevalence ASVs.
Constancy of contaminant concentrations across samples
Evaluate whether the concentration of contaminants is independent of total quantified DNA, using contaminants identified by the prevalence method (which does not incorporate DNA concentration information).
##get ASVs classified as contaminants by the prevalence method at the default 0.1 threshold
contam0.1 <- rownames(ocp)[ocp$p.prev <= 0.1]
MUC_contam0.1 <- prune_taxa(contam0.1, MUC2)
#look at the 0.1 threshold data
sample_data(MUC_contam0.1)$sample_sums_contam <- sample_sums(MUC_contam0.1)
sample_data(MUC_contam0.1)$sample_sums_total <- sample_sums(MUC2)
sd <- data.frame(sample_data(MUC_contam0.1))
sd$uL_to_add[sd$uL_to_add < 0 | sd$uL_to_add > 110] <- 110 #'uL to add' is the volume of each sample that was added to the pooled DNA sample and sequenced. some samples did not have enough DNA to pool, so 110uL of those samples was added.
#total reads/uL vs. quant_reading
p <- ggplot(sd, aes(x=samp_conc, y=sample_sums_total/uL_to_add, color=Subject))
p2 <- p +
geom_point(alpha=0.7) +
labs(title='Total reads', x=expression(ng~DNA/mu*L), y=expression(Total~Reads/mu*L)) + guides(shape=FALSE, color=FALSE)
#contam reads/uL vs. quant_reading
p <- ggplot(sd, aes(x=samp_conc, y=log10(sample_sums_contam/uL_to_add), color=Subject))
p3 <- p +
geom_point(alpha=0.7) +
labs(title='Contaminant reads only', x=expression(ng~DNA/mu*L), y= expression(atop(~ Contaminant~Reads/mu*L, ~ '(log10)')))
pconst <- plot_grid(p2,p3, nrow=1, rel_widths = c(1,1.3))
pconst
ggsave(file.path(path.out, "Oral_Contaminant_Concentrations.pdf"), pconst, width=7, height=3.2, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Contaminant_Concentrations.png"), pconst, width=7, height=3.2, units="in")
Add score annotations to the 6 ASVs shown in Figure 2a
#extract the dataframe from the plot_abundance function
f2a_df <- plot_abundance(MUC,c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200'),
"quant_reading",taxa_are_rows=FALSE, returndf = TRUE)
#extract probability scores for the 6 taxa in Fig. 2a
TAXann_annotations <- TAXann[TAXann$Id %in% c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200'),c('Id','Score','Method')]
TAXann_annotations$annot <- rep(paste0(str_sub(TAXann_annotations$Method, 1, 1), ' = ', round(TAXann_annotations$Score, 3)))
TAXann_annotations$y <- rep(c(-7.25, -8.25, -9.25), each = 6)
colnames(TAXann_annotations)
## [1] "Id" "Score" "Method" "annot" "y"
colnames(TAXann_annotations)[1] <- "taxa_to_plot" #this column name must match the f2a_df column name for facet_wrap to work
taxon_levels <- c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200')
TAXann_annotations$taxa_to_plot <- factor(TAXann_annotations$taxa_to_plot, levels = taxon_levels)
#plot with annotations
p1 <- ggplot(f2a_df, aes(log(DNA_conc), log(taxon_abundance)), label=taxa_to_plot)
f2a_annot <- p1 +
geom_point() +
facet_wrap(~taxa_to_plot, nrow=1) +
geom_text(data=TAXann_annotations, aes(x=2, y=y, label=annot, hjust=0), size=2.5) #+
# theme(axis.text.x = element_text(colour="grey20",size=15,angle=0,
# hjust=.5,vjust=.5,face="plain"),
# axis.text.y = element_text(colour="grey20",size=10,angle=0,
# hjust=1,vjust=0,face="plain"),
# axis.title.x = element_text(colour="grey20",size=15,angle=0,
# hjust=.5,vjust=0,face="plain"),
# axis.title.y = element_text(colour="grey20",size=15,angle=90,
# hjust=.5,vjust=.5,face="plain"),
# strip.text.x = element_text(size = 15, angle = 0),
# legend.title = element_text(size = 15, angle = 0),
# legend.text = element_text(size = 15, angle = 0)) +
f2a_annot
fig2 <- plot_grid(f2a_annot, histo, nrow=2, rel_heights = c(0.8,1), labels = c('a','b'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fig2
ggsave(file.path(path.out, "Oral_Fig2_annot.pdf"), fig2, width=7.5, height=5, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Fig2_annot.png"), fig2, width=7.5, height=5, units="in")
Interactive 3D plot of the prevalence, frequency and combined scores
##plot in 3d
TAXannotate_prev2 <- TAXann[TAXann$TAXprev > 1,]
p <- plot_ly(TAXannotate_prev2, x = ~prob.f, y = ~prob.p, z = ~prob.c, color= ~TAXprev, text = ~paste0(Id, ': ', Genus, '\nPrev:', TAXprev, ', Abund: ', TAXsums)) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'prob.f'),
yaxis = list(title = 'prob.p'),
zaxis = list(title = 'prob.c')))
p
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] bindrcpp_0.2.2 ComplexHeatmap_1.17.1 decontam_1.1.2
## [4] phyloseq_1.25.2 plotly_4.8.0 circlize_0.4.4
## [7] openxlsx_4.1.0 cowplot_0.9.2 knitr_1.20
## [10] dplyr_0.7.5 RColorBrewer_1.1-2 ape_5.1
## [13] gridExtra_2.3 stringr_1.3.1 vegan_2.5-2
## [16] lattice_0.20-35 permute_0.9-4 ggplot2_3.1.0
## [19] reshape2_1.4.3
##
## loaded via a namespace (and not attached):
## [1] Biobase_2.38.0 httr_1.3.1 tidyr_0.8.1
## [4] splines_3.4.3 jsonlite_1.5 viridisLite_0.3.0
## [7] foreach_1.4.4 shiny_1.1.0 assertthat_0.2.0
## [10] stats4_3.4.3 yaml_2.1.19 pillar_1.2.3
## [13] backports_1.1.2 glue_1.2.0 digest_0.6.18
## [16] promises_1.0.1 XVector_0.18.0 colorspace_1.3-2
## [19] httpuv_1.4.3 htmltools_0.3.6 Matrix_1.2-14
## [22] plyr_1.8.4 pkgconfig_2.0.1 GetoptLong_0.1.7
## [25] zlibbioc_1.24.0 xtable_1.8-2 purrr_0.2.5
## [28] scales_1.0.0 later_0.7.2 tibble_1.4.2
## [31] mgcv_1.8-23 IRanges_2.12.0 withr_2.1.2
## [34] BiocGenerics_0.24.0 lazyeval_0.2.1 mime_0.5
## [37] survival_2.42-3 magrittr_1.5 evaluate_0.10.1
## [40] nlme_3.1-137 MASS_7.3-50 tools_3.4.3
## [43] data.table_1.11.8 GlobalOptions_0.1.0 S4Vectors_0.16.0
## [46] munsell_0.5.0 cluster_2.0.7-1 zip_1.0.0
## [49] Biostrings_2.46.0 ade4_1.7-11 compiler_3.4.3
## [52] rlang_0.3.0.1 rhdf5_2.22.0 iterators_1.0.9
## [55] biomformat_1.6.0 rjson_0.2.19 htmlwidgets_1.2
## [58] crosstalk_1.0.0 igraph_1.2.2 labeling_0.3
## [61] rmarkdown_1.9 multtest_2.34.0 gtable_0.2.0
## [64] codetools_0.2-15 R6_2.3.0 bindr_0.1.1
## [67] rprojroot_1.3-2 shape_1.4.4 stringi_1.2.2
## [70] parallel_3.4.3 Rcpp_0.12.19 tidyselect_0.2.4