Analysis of Oral Mucosal 16S dataset

The data analyzed here were generated as part of the larger study reported in “A spatial gradient of bacterial diversity in the human oral cavity shaped by salivary flow”, Proctor et al., Nature communications, 2018: https://doi.org/10.1038/s41467-018-02900-1

Prepare workspace

#load packages
cran_packages <- c("reshape2", "ggplot2", "vegan", "stringr", "gridExtra", "ape", "RColorBrewer", "dplyr", "knitr","cowplot","openxlsx", "circlize", "plotly")
bioc_packages <- c("phyloseq","decontam","ComplexHeatmap")
sapply(c(cran_packages, bioc_packages), require, character.only = TRUE)

##       reshape2        ggplot2          vegan        stringr      gridExtra 
##           TRUE           TRUE           TRUE           TRUE           TRUE 
##            ape   RColorBrewer          dplyr          knitr        cowplot 
##           TRUE           TRUE           TRUE           TRUE           TRUE 
##       openxlsx       circlize         plotly       phyloseq       decontam 
##           TRUE           TRUE           TRUE           TRUE           TRUE 
## ComplexHeatmap 
##           TRUE

#settings
theme_set(theme_bw())
options(stringsAsFactors = FALSE)
path.in <- "OralContamination/" # CHANGE ME
path.out <- "Figures/" # CHANGE ME

#color palettes
cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#000000")
methodPalette <- c(Frequency="darkorange2", Prevalence="turquoise3", Combined="darkorchid4")
prevalencePalette <- c("2" = "#edf8e9", "3-5" = "#bae4b3", "6-10" = "#74c476", "11+" = "#238b45")

#load phyloseq object
load(file.path(path.in, "hypo_sa1_0001_DADA2_ps_new.Rdata"))
ps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2420 taxa and 767 samples ]
## sample_data() Sample Data:       [ 767 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 2420 taxa by 6 taxonomic ranks ]

Subset Samples

There may be differences between DNA extracted in PowerSoil plates vs. tubes. For these analyses, subset to just include plate samples, and remove any barcode replicates (“R04” samples).

#remove outlier sample that was added to the DNA pool twice
MUC <- subset_samples(ps, X.SampleID != "P1104C08703R00")

#for contamination analysis, keep only samples that were extracted in 96-well plates
MUC <- subset_samples(MUC, Extraction.Protocol == "plate")

#the 'Replicate' column indicates whether the sample is a 'normal' sample (R00) or whether the sample is a type of derivative sample, e.g. 'R04' indicates the sample's DNA was PCR amplified a second time with a different primer barcode
MUC <- subset_samples(MUC, Replicate == "R00")

#remove singletons (there shouldn't be any from DADA2; only singletons might be strays after subsetting to plate samples)
MUC <- prune_taxa(taxa_sums(MUC) > 1, MUC)
MUC <- prune_samples(sample_sums(MUC) > 0, MUC)

#compare unfiltered data to plate-extracted, non-barcode replicate samples that lack singletons.
ps

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2420 taxa and 767 samples ]
## sample_data() Sample Data:       [ 767 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 2420 taxa by 6 taxonomic ranks ]

MUC

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1951 taxa and 569 samples ]
## sample_data() Sample Data:       [ 569 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 1951 taxa by 6 taxonomic ranks ]

MUCprev <- transform_sample_counts(MUC, function(x) ifelse(x>0, 1, 0))
MUCrelabund <- transform_sample_counts(MUC, function(x) x/sum(x))

Extract OTU table and sample metadata as base R data.frames

#get sample sums: each sample's sequencing depth
sample_data(MUC)$sample_sums <- sample_sums(MUC)
MAP <- data.frame(sample_data(MUC))
OTU <- data.frame(as(otu_table(MUC), 'matrix'))
# TAX <- data.frame(tax_table(MUC)) # These older assignments will be replaced by Silva v128 assignments

Load annotated taxonomy table

This includes the DNA sequences, HOMD taxonomic assignments, and by-hand resolutions of discrepancies between Silva and HOMD.

TAXannotate <- read.csv(file.path(path.in, "TAXannotate.csv"))
rownames(TAXannotate) <- TAXannotate$Id
TAXannotate <- TAXannotate[taxa_names(MUC),] # subset down to the taxa being analyzed

Assign taxonomy using SILVA v128

The stored Silva assignments used an older version of the database, so we rerun taxonomic assignment using Silva v128 and dada2’s assignTaxonomy function.

# Function to simplify Silva assignments for consistency with genus reporting in literature
parse_silva_genus <- Vectorize(function(gen) {
  gen <- gsub("\\[", "", gen)
  gen <- gsub("\\]", "", gen)
  gen <- gsub("Escherichia/Shigella", "Escherichia", gen)
  gen <- gsub("Burkholderia-Paraburkholderia", "Burkholderia", gen)
  gen <- gsub("Burkholderia-Caballeronia-Paraburkholderia", "Burkholderia", gen)
  gen <- gsub("_.*$", "", gen)
  if(is.na(gen)) gen <- "unknown"
  gen
})
# Uncomment the following to run w/o the taxtabSilva128.rds file
# set.seed(100)
# library(dada2); packageVersion("dada2")
# t128 <- assignTaxonomy(TAXannotate$Sequence, "~/tax/silva_nr_v128_train_set.fa.gz", multithread=TRUE)
# t128[,"Genus"] <- parse_silva_genus(t128[,"Genus"])
# saveRDS(t128, file.path(path.in, "t128.rds"))
t128 <- readRDS(file.path(path.in, "taxtabSilva128.rds"))
TAXannotate[,colnames(t128)] <- t128
TAXannotate[,"GenusSILVA"] <- t128[,"Genus"]

Augment the annotated taxonomy table.

This includes includes assigning the recommended Genus column, as decided by hand between Silva and HOMD assignments:

# 'GenusREC' column will be the recommended taxonomy, based on Taxonomy_recommendation notes.
# assign SILVA to anything that's 'both'
# assign 'neither' to anything that's in one sample or has no recommendation from SILVA or HOMD
unique(TAXannotate$Taxonomy_recommendation)

## [1] "GenusSILVA"             "neither at genus level"
## [3] "GenusHOMD"              "only in one sample"    
## [5] "both"

# Values are: "only in one sample", "GenusSILVA", "GenusHOMD", "both", "neither at genus level"
TAXannotate$GenusREC <- "neither" # Default is unassigned
use.silva <- TAXannotate$Taxonomy_recommendation %in% c("GenusSILVA", "both")
TAXannotate$GenusREC[use.silva] <- TAXannotate$GenusSILVA[use.silva] # Assign Silva genus
use.homd <- TAXannotate$Taxonomy_recommendation %in% c("GenusHOMD")
TAXannotate$GenusREC[use.homd] <- TAXannotate$GenusHOMD[use.homd] # Assign HOMD genus
# Augment with abundance and prevalence information
TAXannotate$TAXsums <- taxa_sums(MUC)[TAXannotate$Id] # Add total abundances
TAXannotate$TAXprev <- taxa_sums(MUCprev)[TAXannotate$Id] # Add prevalences

Filter Taxa

Apply prevalence filter to filter out taxa present in fewer than 2 samples. This is a very lenient filter.

MUC2 <- filter_taxa(MUC, function(x) sum(x > 0) > 1, TRUE)
OTU2 <- OTU[,taxa_names(MUC2)]
TAX2 <- TAXannotate[taxa_names(MUC2),]
MAP2 <- MAP[sample_names(MUC2),]

Plot Frequency Patterns

Create a plot showing examples of the two types of frequency patterns observed in the data: frequency independent of DNA concentration and inversely proportional to it (Figure 2a)

#plot_abundance function for figures (similar to the plot_frequency function in package)
plot_abundance <- function(ps, taxa_to_plot, conc, taxa_are_rows=TRUE, norm=TRUE, log=TRUE, returndf=FALSE){

  ot <- as(otu_table(ps), "matrix")
    if(taxa_are_rows){
    ot <- t(ot)
  }
  
  taxa_mismatch <- !(taxa_names(ps) %in% colnames(ot))
  if(sum(taxa_mismatch) > 0){
    stop("Error: 'taxa_are_rows' argument may not be correct")
  }
  
  if(norm){
  ot <- sweep(ot, 1, rowSums(ot), "/")
  }
  
  ot <- as(ot[,colnames(ot) %in% taxa_to_plot], "matrix")
  if(dim(ot)[2] == 1){
    colnames(ot) <- taxa_to_plot
  }
    
  st <- as(sample_data(ps), "data.frame")
  snames <- sample_names(ps)
                   
  plot <- merge(st, ot, by.x = "row.names", by.y = "row.names", sort=FALSE)
  plot_melt <- melt(plot, id.vars = colnames(plot)[1:(dim(plot)[2]-length(taxa_to_plot))])

  colnames(plot_melt)[dim(plot_melt)[2]-1] <- "taxa_to_plot"
  colnames(plot_melt)[dim(plot_melt)[2]] <- "taxon_abundance"
  
  taxon_levels <- taxa_to_plot
  plot_melt$taxa_to_plot <- factor(plot_melt$taxa_to_plot, levels = taxon_levels)
  
  I <- which(colnames(plot_melt) == conc)
  colnames(plot_melt)[I] <- "DNA_conc"
  
  if(returndf == FALSE){
  if(log==TRUE){
    p1 <- ggplot(plot_melt, aes(log(DNA_conc),log(taxon_abundance)))
    return(p1 + geom_point())
  } else if(log==FALSE){
    p1 <- ggplot(plot_melt, aes(DNA_conc,taxon_abundance))
    return(p1 + geom_point())
  }
  } else if(returndf == TRUE){
    return(plot_melt)
  }
}
#Figure 2a
f2a <- plot_abundance(MUC,c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200'),
                      "quant_reading",taxa_are_rows=FALSE)
f2a <- f2a + facet_wrap(~taxa_to_plot,nrow=1) + 
      labs(x= 'log(DNA concentration)', y = 'log(Frequency)') + 
#      geom_point(aes(color=Sample_or_Control)) +
      geom_point() +
#      theme(axis.text.x = element_text(colour="grey20",size=15,angle=0,
#                                       hjust=.5,vjust=.5,face="plain"),
#            axis.text.y = element_text(colour="grey20",size=10,angle=0,
#                                       hjust=1,vjust=0,face="plain"),  
#            axis.title.x = element_text(colour="grey20",size=15,angle=0,
#                                        hjust=.5,vjust=0,face="plain"),
#            axis.title.y = element_text(colour="grey20",size=15,angle=90,
#                                        hjust=.5,vjust=.5,face="plain"),
#            strip.text.x = element_text(size = 15, angle = 0),
#            legend.title = element_text(size = 15, angle = 0),
#            legend.text = element_text(size = 15, angle = 0)) +
      theme(legend.position = 'top')

f2a

Classify contaminants

Using both the frequency and prevalence methods, and also the combined method that classifies based on a score that is a composite of the frequency and pevalence scores.

conc2 <- MAP2$quant_reading # PicoGreen fluroescent intensity
neg2 <- MAP2$Sample_or_Control == 'Control Sample' # True if a negative control sample
# Frequency-based contaminant classification
ocf <- isContaminant(as.matrix(OTU2), conc=conc2, threshold=0.1, detailed=TRUE, normalize=TRUE, method='frequency')
# Prevalence-based contaminant classification
ocp <- isContaminant(as.matrix(OTU2), neg=neg2, threshold=0.1, detailed=TRUE, normalize=TRUE, method='prevalence')
# Combined contaminant classification
occ <- isContaminant(as.matrix(OTU2), conc=conc2, neg=neg2, threshold=0.1, detailed=TRUE, normalize=TRUE, method='combined')

Combine scores from each method into a data.frame, and add to TAX2.

probcols <- data.frame(row.names=rownames(ocf),prob.f=ocf$p.freq, prob.p=ocp$p.prev, prob.c = occ$p)
TAX2 <- cbind(TAX2[,colnames(TAXannotate)], probcols[TAX2$Id,])
# Crude comparison of frequency and prevalence contaminant assignment
table(probcols$prob.f<0.1, probcols$prob.c<0.1)

##        
##         FALSE TRUE
##   FALSE   711   12
##   TRUE      8   44

Solid agreement.

Reference classification

Reference databases were generated for each genus from the HOMD database, and by literatures search for cultivated oral taxa and reported contaminant genera. Load the reference database files & organize genera from the reference databases.

contam_database <- read.csv(file.path(path.in, "contamination_database.csv"))
oral_database <- read.csv(file.path(path.in, "oral_database.csv"))
HOMD <- read.csv(file.path(path.in, "homd_taxonomy_table.csv"))

### Oral genera
#genera that have been visualized in the mouth.
oral_visualized <- unique(oral_database$Genus)

#genera that have been named & cultivated [BJC: removed not visualized condition]
oral_cultivated <- unique(HOMD$Genus[HOMD$Status %in% c('Named','Unnamed')])
#oral_cultivated <- oral_cultivated[!(oral_cultivated %in% oral_visualized)]

# All genera detected in the mouth
oral_all <- unique(c(oral_visualized, oral_cultivated))

### Contaminant genera
#genera that have been identified as contaminants in previous studies in the literature.
contaminatinggenera <- contam_database %>%
  group_by(Genus) %>%
  mutate(numberstudies = length(unique(Reference)))

## Warning: package 'bindrcpp' was built under R version 3.4.4

contam_df <- as.data.frame(contaminatinggenera)

#genera that have been identified as contaminants in more than one previous study
contam_multiple <- unique(contaminatinggenera$Genus[contaminatinggenera$numberstudies > 1])
contam_multiple <- contam_multiple[contam_multiple != "unspecified"]  #68 of these

#genera that have been identified as contaminants in at least one previous study
contam_one <- unique(contaminatinggenera$Genus[contaminatinggenera$Genus != 'unspecified']) #213 of these

###BJC: Lists of genera w/ short names
oral_cult <- unique(HOMD$Genus[HOMD$Status %in% c('Named','Unnamed')])
oral_vis <- unique(oral_database$Genus)
oral <- unique(c(oral_cult, oral_vis))
contam <- unique(contam_df$Genus)
contam2 <- unique(contam_df$Genus[contam_df$numberstudies>1])

Categorize taxa according to reference databases

tt <- TAX2 # Working version of TAX2
tt$Genus <- tt$GenusREC
### Categorize
# Oral, if in oral db and not in contam db. 
# Contaminant, if reported at least twice as a contaminant, and not in oral db.
# Ambiguous otherwise.
tt$Group <- "Ambiguous" # Default
is.oral <- tt$Genus %in% oral & !tt$Genus %in% contam
tt$Group[is.oral] <- "Oral"
is.contam <- tt$Genus %in% contam & !tt$Genus %in% oral
tt$Group[is.contam] <- "Contaminant"
# Augment with prevalence groups and Abundance column
tt$Prevalence <- cut(tt$TAXprev, c(0, 2, 5, 10, 9999), labels=c("2", "3-5", "6-10", "11+"))
tt$Abundance <- tt$TAXsums

Inspect categories.

table(tt$Group)

## 
##   Ambiguous Contaminant        Oral 
##         696          54          97

tapply(tt$Abundance, tt$Group, sum)/sum(sum(tt$TAXsums))

##   Ambiguous Contaminant        Oral 
## 0.937759276 0.006586119 0.055654605

top.genera <- head(names(sort(tapply(tt$Abundance, tt$Genus, sum), decreasing=TRUE)))
in.both <- top.genera %in% oral & top.genera %in% contam
names(in.both) <- top.genera
in.both

## Streptococcus   Haemophilus     Neisseria    Prevotella       Gemella 
##          TRUE          TRUE          TRUE          TRUE          TRUE 
##        Rothia 
##          TRUE

A minority of taxa and reads are being unambiguously assigned, but this is the most useful fraction for assessing classification accuracy, and it is still a non-trival number (~150/850 taxa, ~6% of total reads). Fewer reads are being assigned because the abundant genera (e.g. Steptococcus) are expansive, and unsurprisingly show up in both the oral and contaminant databases.

Plot score distribution on the oral dataset.

This is Figure 2b, with full histogram for prevalence, frequency and combined methods.

tt.class <- tt[!is.na(tt$prob.c),] # 775 classified by decontam, aka "All ASVs"
tt.class.contam <- tt.class[tt.class$Group == "Contaminant",]
tt.class.oral <- tt.class[tt.class$Group == "Oral",]
TAXann <- rbind(cbind(tt.class, Score=tt.class$prob.f, Method="Frequency"),
                cbind(tt.class, Score=tt.class$prob.p, Method="Prevalence"),
                cbind(tt.class, Score=tt.class$prob.c, Method="Combined"))

TAXann$Method <- factor(TAXann$Method, levels=c('Frequency','Prevalence','Combined'))

#figure 2b
histo <- ggplot(TAXann, aes(x=Score, fill=Prevalence))
histo <- histo + geom_histogram() + labs(x = 'decontam Score', y='Number ASVs') + 
  facet_wrap(~Method, nrow=1) +
  #http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=4
  scale_fill_manual(values=prevalencePalette) +
#  theme(axis.text.x = element_text(colour="grey20",size=10,angle=0,
#                                    hjust=.5,vjust=.5,face="plain"),
#        axis.text.y = element_text(colour="grey20",size=10,angle=0,
#                                   hjust=1,vjust=0,face="plain"),  
#        axis.title.x = element_text(colour="grey20",size=15,angle=0,
#                                    hjust=.5,vjust=0,face="plain"),
#        axis.title.y = element_text(colour="grey20",size=15,angle=90,
#                                    hjust=.5,vjust=.5,face="plain"),
#        strip.text.x = element_text(size = 15, angle = 0),
#        legend.title = element_text(size = 15, angle = 0),
#        legend.text = element_text(size = 10, angle = 0)) +
  theme(legend.position = "bottom")
histo

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

###ggsave(file.path(path.out, "Oral_Score_Histogram.pdf"), histo, width=7, height=2.6, units="in", useDingbats=FALSE)
###ggsave(file.path(path.out, "Oral_Score_Histogram.png"), histo, width=7, height=2.6, units="in")

Plot score distributions for difference reference categories

Supplementary figure showing score histograms for genera that are ambiguous by reference classification.

histo.class <- ggplot(data=TAXann, aes(x=Score, fill=Prevalence)) +
  geom_histogram() + labs(x = 'decontam Score', y='Number ASVs') + 
  facet_grid(Group~Method, scales="free_y") +
  scale_fill_manual(values=prevalencePalette) +
  theme(legend.position = "bottom")
histo.class

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave(file.path(path.out, "Oral_Score_Histogram_byReferenceClassification.pdf"), histo.class, width=7, height=4.8, units="in", useDingbats=FALSE)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave(file.path(path.out, "Oral_Score_Histogram_byReferenceClassification.png"), histo.class, width=7, height=4.8, units="in")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot score distribution on the oral dataset, weighted by abundance.

This is the Supplementary Figure with full histogram weighted by abundance for the prevalence, frequency and combined methods. This is the abundance-weighted version of Figure 2.

histo.wt <- histo + aes(weight=Abundance)
histo.wt

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

histo.wt.log <- histo.wt + scale_y_log10(name="Reads") + 
                aes(fill=Method) + scale_fill_manual(values=methodPalette)

## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.

# Not a correct histogram if fill specified, as logged values are stacked inappropriately
histo.wt.log

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 7 rows containing missing values (geom_bar).

###ggsave(file.path(path.out, "Oral_Score_Histogram_Weighted.pdf"), histo.wt, width=7, height=2.6, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Score_Histogram_Weighted_logY.pdf"), histo.wt.log, width=7, height=2.6, units="in", useDingbats=FALSE)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 7 rows containing missing values (geom_bar).

ggsave(file.path(path.out, "Oral_Score_Histogram_Weighted_logY.png"), histo.wt.log, width=7, height=2.6, units="in")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Transformation introduced infinite values in continuous y-axis

## Warning: Removed 7 rows containing missing values (geom_bar).

Pub quality figure (w/ Ambiguous in background):

scat <- ggplot(data=tt[tt$Group != "Ambiguous",], aes(x=prob.f, y=prob.p, color=Group, size=Abundance))
scat <- scat + scale_colour_manual(values = c("Oral"="blue", "Contaminant"="red", "Ambiguous"="gray80"), name="Reference\nClassification")
scat <- scat + geom_point(data=tt[tt$Group == "Ambiguous",], alpha=0.3)
scat <- scat + geom_point(alpha=0.3)
scat <- scat + labs(x = 'Score (Frequency)', y='Score (Prevalence)') + guides(size=FALSE)
scat <- scat + facet_grid(~Prevalence)
scat <- scat + guides(colour = guide_legend(override.aes = list(alpha = 1)))
scat <- scat + scale_x_continuous(breaks=c(0,0.25,0.5,0.75,1), labels=c("0", "0.25", "0.5", "0.75", "1"))
scat <- scat + scale_y_continuous(breaks=c(0,0.25,0.5,0.75,1), labels=c("0", "0.25", "0.5", "0.75", "1"))
scat

###ggsave(file.path(path.out, "Oral_Scores_Scatter.pdf"), scat, width=10, height=2.8, units="in", useDingbats=FALSE)

Calculate and plot the ROC for each classification method:

cc <- tt[tt$Group != "Ambiguous",] # Just reference classified taxa
get.roc <- function(df, score, weight=FALSE, thresh=0.1) {
  ord <- order(df[,score], na.last=NA)
  df <- df[ord,]
  if(weight) {
    xx <- c(0, cumsum(df$Abundance*(df$Group=="Oral"))/sum(df$Abundance*(df$Group=="Oral")))
    yy <- c(0, cumsum(df$Abundance*(df$Group=="Contaminant"))/sum(df$Abundance*(df$Group=="Contaminant")))
  } else {
    xx <- c(0, cumsum(df$Group=="Oral")/sum(df$Group=="Oral"))
    yy <- c(0, cumsum(df$Group=="Contaminant")/sum(df$Group=="Contaminant"))
  }
  i.thresh <- min(which(df[,score]>thresh))
  ii <- seq_along(xx) == i.thresh
  return(data.frame(InverseSpecificity=xx, Sensitivity=yy, Threshold=ii)) 
  # InverseSpecificity = 1-Specificity
}
rocdf <- rbind(cbind(get.roc(cc, "prob.f"), Method="Frequency", Measure="ASVs"),
               cbind(get.roc(cc, "prob.p"), Method="Prevalence", Measure="ASVs"),
               cbind(get.roc(cc, "prob.c"), Method="Combined", Measure="ASVs"))
rocdf$Method <- factor(rocdf$Method, levels=c("Frequency", "Prevalence", "Combined"))

arocdf <- rbind(cbind(get.roc(cc, "prob.f", weight=TRUE), Method="Frequency", Measure="Reads"),
                cbind(get.roc(cc, "prob.p", weight=TRUE), Method="Prevalence", Measure="Reads"),
                cbind(get.roc(cc, "prob.c", weight=TRUE), Method="Combined", Measure="Reads"))
arocdf$Method <- factor(arocdf$Method, levels=c("Frequency", "Prevalence", "Combined"))

proc <- ggplot(data=rbind(rocdf, arocdf), aes(x=InverseSpecificity, y=Sensitivity, color=Method)) + 
        geom_line() + geom_abline(slope=1,intercept=0,col="black",linetype="dashed") +
        facet_wrap(~Measure) +
        scale_x_continuous(breaks=c(0, 0.25, 0.5, 0.75, 1), labels=c(1, 0.75, 0.5, 0.25, 0)) +
        xlab("Specificity") + # Previous scale converts from 1-Specificity to Specificity
        geom_point(aes(size=Threshold)) + 
        scale_color_manual(values=methodPalette) +
        scale_size_manual(values=c("FALSE"=0, "TRUE"=2)) + guides(size=FALSE)
proc

###ggsave(file.path(path.out, "Oral_ROC.pdf"), proc, width=6, height=2.4, units="in", useDingbats=FALSE)
rocdf[rocdf$Threshold,] # InverseSpecificity here is 1-Specificity

##     InverseSpecificity Sensitivity Threshold     Method Measure
## 19          0.00000000   0.3333333      TRUE  Frequency    ASVs
## 191         0.05154639   0.6111111      TRUE Prevalence    ASVs
## 321         0.01075269   0.4545455      TRUE   Combined    ASVs

arocdf[arocdf$Threshold,]

##     InverseSpecificity Sensitivity Threshold     Method Measure
## 19        0.000000e+00   0.9706820      TRUE  Frequency   Reads
## 191       6.409174e-04   0.7689587      TRUE Prevalence   Reads
## 321       7.552549e-05   0.9796471      TRUE   Combined   Reads

1-arocdf[arocdf$Threshold,"InverseSpecificity"]

## [1] 1.0000000 0.9993591 0.9999245

TWITTER REQUEST: Calculate and plot the precision-recall curve for each classification method:

get.pr <- function(df, score, weight=FALSE, thresh=0.1) {
  ord <- order(df[,score], na.last=NA)
  df <- df[ord,]
  if(weight) {
    tp <- cumsum(df$Abundance*(df$Group=="Contaminant"))
    fp <- cumsum(df$Abundance*(df$Group=="Oral"))
    totp <- sum(df$Abundance * (df$Group=="Contaminant"))
  } else {
    tp <- cumsum(df$Group=="Contaminant")
    fp <- cumsum(df$Group=="Oral")
    totp <- sum(df$Group=="Contaminant")
  }
  i.thresh <- min(which(df[,score]>thresh))
  ii <- seq_along(tp) == i.thresh
  return(data.frame(Precision=tp/(tp+fp), Recall=tp/totp, Threshold=ii)) 
}
prdf <- rbind(cbind(get.pr(cc, "prob.f"), Method="Frequency", Measure="ASVs"),
              cbind(get.pr(cc, "prob.p"), Method="Prevalence", Measure="ASVs"),
              cbind(get.pr(cc, "prob.c"), Method="Combined", Measure="ASVs"))
prdf$Method <- factor(prdf$Method, levels=c("Frequency", "Prevalence", "Combined"))

aprdf <- rbind(cbind(get.pr(cc, "prob.f", weight=TRUE), Method="Frequency", Measure="Reads"),
               cbind(get.pr(cc, "prob.p", weight=TRUE), Method="Prevalence", Measure="Reads"),
               cbind(get.pr(cc, "prob.c", weight=TRUE), Method="Combined", Measure="Reads"))
aprdf$Method <- factor(aprdf$Method, levels=c("Frequency", "Prevalence", "Combined"))

ppr <- ggplot(data=rbind(aprdf), aes(x=Recall, y=Precision, color=Method)) + 
       geom_line() + 
       facet_wrap(~Measure) +
       xlab("Recall") + # Previous scale converts from 1-Specificity to Specificity
       geom_point(aes(size=Threshold)) + 
       scale_color_manual(values=methodPalette) +
       scale_size_manual(values=c("FALSE"=0, "TRUE"=2)) + guides(size=FALSE)
ppr

###ggsave(file.path(path.out, "Oral_ROC.pdf"), proc, width=6, height=2.4, units="in", useDingbats=FALSE)
prdf[prdf$Threshold,] # InverseSpecificity here is 1-Specificity

##     Precision    Recall Threshold     Method Measure
## 19  0.9473684 0.3333333      TRUE  Frequency    ASVs
## 190 0.8717949 0.6296296      TRUE Prevalence    ASVs
## 319 0.9411765 0.4848485      TRUE   Combined    ASVs

aprdf[aprdf$Threshold,]

##     Precision    Recall Threshold     Method Measure
## 19  0.9998566 0.9706820      TRUE  Frequency   Reads
## 190 0.9930102 0.7694226      TRUE Prevalence   Reads
## 319 0.9993429 0.9800923      TRUE   Combined   Reads

Investigate the most glaring seeming misclassifications.

The seeming FN in the 11+ group (the red point in the upper right of that panel):

upright.11 <- tt$prob.f > 0.5 & tt$prob.p > 0.5 & tt$TAXprev > 10; sum(upright.11)

## [1] 327

i.fn.11 <- which(tt$Group %in% "Contaminant" & upright.11)
tt[i.fn.11,"Genus"] # [1] "Peptococcus"

## [1] "Peptococcus"

Peptococcus is a known inhabitant of the human mouth. Just a reference db error.

The seeming FP in the 6-10 group (the blue point in the lower left of that panel, none in 11+):

lowleft.6 <- tt$prob.f < 0.5 & tt$prob.p < 0.5 & tt$TAXprev >= 6; sum(lowleft.6)

## [1] 49

i.fp.6 <- which(tt$Group %in% "Oral" & lowleft.6)
tt[i.fp.6,"Genus"] # [1] "Moraxella"    "Enterobacter" "Delftia"

## [1] "Mycobacterium"

Mycobacterium certainly colonizes humans, but it also well known as a laboratory contaminant despite not appearing in our contamination database.

Main Text Figures

Make combined publication version of Figure 2 (Frequency patterns and score distributions on oral dataset):

fig2 <- plot_grid(f2a, histo, nrow=2, rel_heights = c(0.8,1), labels = c('a','b'))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

fig2

ggsave(file.path(path.out, "Oral_Fig2.pdf"), fig2, width=7.5, height=5, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Fig2.png"), fig2, width=7.5, height=5, units="in")

Make combined publication version of Figure 3 (accuracy on the oral dataset):

fig3 <- plot_grid(scat, proc, nrow=2, rel_heights = c(1,1), labels = c('a','b'))
fig3

ggsave(file.path(path.out, "Oral_Fig3.pdf"), fig3, width=7, height=3.6, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Fig3.png"), fig3, width=7, height=3.6, units="in")

Further investigation of the frequency method FP rate

Evaluate the FP rate of the frequency method under permutation of the concentrations to address reviewer concerns about the pseudo-F-test frequency classification statistic

set.seed(100)
fperm <- lapply(seq(100), function(i) {
  MAPperm <- MAP2
  MAPperm$Sample_or_Control <- MAP2$Sample_or_Control[sample(nrow(MAP2))]
  MAPperm$quant_reading <- MAP2$quant_reading[sample(nrow(MAP2))]
  ocf.perm <- isContaminant(as.matrix(OTU2), MAPperm$quant_reading, threshold = 0.1, 
                            detailed=TRUE, normalize=TRUE, method='frequency')
  ocf.perm
})
# Calculate FP rate at the ASV level
fp.asv <- Reduce("+", lapply(fperm, function(x) table(x$contaminant)))
fp.asv
# FALSE  TRUE 
# 83970   730 
fp.asv/sum(fp.asv)
#       FALSE        TRUE 
# 0.991381346 0.008618654
# Calculate FP rate at the Read level
fp.rd <- sum(sapply(fperm, function(x) sum(x$freq[x$contaminant])))/length(fperm)
fp.rd
#       FALSE        TRUE 
# 0.991381346 0.008618654
# Get summary statistics of the prevalence of FPs
permdf <- do.call(rbind, fperm)
summary(permdf$prev[permdf$contaminant])

A low (sub 1%) false positive rate, almost all of which comes on very low prevalence ASVs.

Constancy of contaminant concentrations across samples

Evaluate whether the concentration of contaminants is independent of total quantified DNA, using contaminants identified by the prevalence method (which does not incorporate DNA concentration information).

##get ASVs classified as contaminants by the prevalence method at the default 0.1 threshold
contam0.1 <- rownames(ocp)[ocp$p.prev <= 0.1]
MUC_contam0.1 <- prune_taxa(contam0.1, MUC2)

#look at the 0.1 threshold data
sample_data(MUC_contam0.1)$sample_sums_contam <- sample_sums(MUC_contam0.1)
sample_data(MUC_contam0.1)$sample_sums_total <- sample_sums(MUC2)
sd <- data.frame(sample_data(MUC_contam0.1))
sd$uL_to_add[sd$uL_to_add < 0 | sd$uL_to_add > 110] <- 110 #'uL to add' is the volume of each sample that was added to the pooled DNA sample and sequenced. some samples did not have enough DNA to pool, so 110uL of those samples was added.

#total reads/uL vs. quant_reading
p <- ggplot(sd, aes(x=samp_conc, y=sample_sums_total/uL_to_add, color=Subject))
p2 <- p + 
  geom_point(alpha=0.7) + 
  labs(title='Total reads', x=expression(ng~DNA/mu*L), y=expression(Total~Reads/mu*L)) + guides(shape=FALSE, color=FALSE)

#contam reads/uL vs. quant_reading
p <- ggplot(sd, aes(x=samp_conc, y=log10(sample_sums_contam/uL_to_add), color=Subject))
p3 <- p + 
  geom_point(alpha=0.7) + 
  labs(title='Contaminant reads only', x=expression(ng~DNA/mu*L), y= expression(atop(~ Contaminant~Reads/mu*L, ~ '(log10)')))

pconst <- plot_grid(p2,p3, nrow=1, rel_widths = c(1,1.3))
pconst

ggsave(file.path(path.out, "Oral_Contaminant_Concentrations.pdf"), pconst, width=7, height=3.2, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Contaminant_Concentrations.png"), pconst, width=7, height=3.2, units="in")

Add score annotations to the 6 ASVs shown in Figure 2a

#extract the dataframe from the plot_abundance function
f2a_df <- plot_abundance(MUC,c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200'),
                      "quant_reading",taxa_are_rows=FALSE, returndf = TRUE)

#extract probability scores for the 6 taxa in Fig. 2a
TAXann_annotations <- TAXann[TAXann$Id %in% c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200'),c('Id','Score','Method')]
TAXann_annotations$annot <- rep(paste0(str_sub(TAXann_annotations$Method, 1, 1), ' = ', round(TAXann_annotations$Score, 3)))
TAXann_annotations$y <- rep(c(-7.25, -8.25, -9.25), each = 6)
colnames(TAXann_annotations)

## [1] "Id"     "Score"  "Method" "annot"  "y"

colnames(TAXann_annotations)[1] <- "taxa_to_plot" #this column name must match the f2a_df column name for facet_wrap to work
taxon_levels <- c('Seq3','Seq53','Seq152','Seq1','Seq12','Seq200')
TAXann_annotations$taxa_to_plot <- factor(TAXann_annotations$taxa_to_plot, levels = taxon_levels)

#plot with annotations
p1 <- ggplot(f2a_df, aes(log(DNA_conc), log(taxon_abundance)), label=taxa_to_plot)
f2a_annot <- p1 + 
  geom_point() + 
  facet_wrap(~taxa_to_plot, nrow=1) + 
  geom_text(data=TAXann_annotations, aes(x=2, y=y, label=annot, hjust=0), size=2.5) #+
  #      theme(axis.text.x = element_text(colour="grey20",size=15,angle=0,
#                                       hjust=.5,vjust=.5,face="plain"),
#            axis.text.y = element_text(colour="grey20",size=10,angle=0,
#                                       hjust=1,vjust=0,face="plain"),  
#            axis.title.x = element_text(colour="grey20",size=15,angle=0,
#                                        hjust=.5,vjust=0,face="plain"),
#            axis.title.y = element_text(colour="grey20",size=15,angle=90,
#                                        hjust=.5,vjust=.5,face="plain"),
#            strip.text.x = element_text(size = 15, angle = 0),
#            legend.title = element_text(size = 15, angle = 0),
#            legend.text = element_text(size = 15, angle = 0)) +

f2a_annot

fig2 <- plot_grid(f2a_annot, histo, nrow=2, rel_heights = c(0.8,1), labels = c('a','b'))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

fig2

ggsave(file.path(path.out, "Oral_Fig2_annot.pdf"), fig2, width=7.5, height=5, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "Oral_Fig2_annot.png"), fig2, width=7.5, height=5, units="in")

Interactive 3D plot of the prevalence, frequency and combined scores

##plot in 3d
TAXannotate_prev2 <- TAXann[TAXann$TAXprev > 1,]

p <- plot_ly(TAXannotate_prev2, x = ~prob.f, y = ~prob.p, z = ~prob.c, color= ~TAXprev, text = ~paste0(Id, ': ', Genus, '\nPrev:', TAXprev, ', Abund: ', TAXsums)) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'prob.f'),
                     yaxis = list(title = 'prob.p'),
                     zaxis = list(title = 'prob.c')))

p

sessionInfo()

## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] bindrcpp_0.2.2        ComplexHeatmap_1.17.1 decontam_1.1.2       
##  [4] phyloseq_1.25.2       plotly_4.8.0          circlize_0.4.4       
##  [7] openxlsx_4.1.0        cowplot_0.9.2         knitr_1.20           
## [10] dplyr_0.7.5           RColorBrewer_1.1-2    ape_5.1              
## [13] gridExtra_2.3         stringr_1.3.1         vegan_2.5-2          
## [16] lattice_0.20-35       permute_0.9-4         ggplot2_3.1.0        
## [19] reshape2_1.4.3       
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.38.0      httr_1.3.1          tidyr_0.8.1        
##  [4] splines_3.4.3       jsonlite_1.5        viridisLite_0.3.0  
##  [7] foreach_1.4.4       shiny_1.1.0         assertthat_0.2.0   
## [10] stats4_3.4.3        yaml_2.1.19         pillar_1.2.3       
## [13] backports_1.1.2     glue_1.2.0          digest_0.6.18      
## [16] promises_1.0.1      XVector_0.18.0      colorspace_1.3-2   
## [19] httpuv_1.4.3        htmltools_0.3.6     Matrix_1.2-14      
## [22] plyr_1.8.4          pkgconfig_2.0.1     GetoptLong_0.1.7   
## [25] zlibbioc_1.24.0     xtable_1.8-2        purrr_0.2.5        
## [28] scales_1.0.0        later_0.7.2         tibble_1.4.2       
## [31] mgcv_1.8-23         IRanges_2.12.0      withr_2.1.2        
## [34] BiocGenerics_0.24.0 lazyeval_0.2.1      mime_0.5           
## [37] survival_2.42-3     magrittr_1.5        evaluate_0.10.1    
## [40] nlme_3.1-137        MASS_7.3-50         tools_3.4.3        
## [43] data.table_1.11.8   GlobalOptions_0.1.0 S4Vectors_0.16.0   
## [46] munsell_0.5.0       cluster_2.0.7-1     zip_1.0.0          
## [49] Biostrings_2.46.0   ade4_1.7-11         compiler_3.4.3     
## [52] rlang_0.3.0.1       rhdf5_2.22.0        iterators_1.0.9    
## [55] biomformat_1.6.0    rjson_0.2.19        htmlwidgets_1.2    
## [58] crosstalk_1.0.0     igraph_1.2.2        labeling_0.3       
## [61] rmarkdown_1.9       multtest_2.34.0     gtable_0.2.0       
## [64] codetools_0.2-15    R6_2.3.0            bindr_0.1.1        
## [67] rprojroot_1.3-2     shape_1.4.4         stringi_1.2.2      
## [70] parallel_3.4.3      Rcpp_0.12.19        tidyselect_0.2.4