Replication and refinement of a vaginal microbial signature of preterm birth in two racially distinct cohorts of US women. Callahan et al., 2017. https://doi.org/10.1073/pnas.1705899114
Load pregnancy data:
#load packages
packages <- c("decontam", "ggplot2", "reshape2", "phyloseq", "dada2", "gridExtra")
sapply(packages, require, character.only = TRUE)
## decontam ggplot2 reshape2 phyloseq dada2 gridExtra
## TRUE TRUE TRUE TRUE TRUE TRUE
#settings
theme_set(theme_bw())
options(stringsAsFactors = FALSE)
path.in <- "~/DecontamManuscript/Analyses/CallahanPTB" # CHANGE ME
path.out <- "~/DecontamManuscript/Analyses/Figures" # CHANGE ME
#color palettes
cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#000000")
methodPalette <- c(Prevalence="turquoise3", Frequency="orangered1", Combined="darkorchid4")
prevalencePalette <- c("2" = "#edf8e9", "3-5" = "#bae4b3", "6-10" = "#74c476", "11+" = "#238b45")
Coincide stn with st:
load(file.path(path.in, "preg_w_negs.rda"))
dfn$Negative <- dfn$BodySite %in% c("EC", "Mock", "NTC")
tab <- table(dfn$Run, dfn$Negative); tab # Drop Run04 as has no vaginal samples
##
## FALSE TRUE
## IL02 158 28
## IL03 237 32
## IL04 0 32
## IL05 392 29
## IL06 703 26
## IL07 114 24
## IL08 22 22
## IL09 622 29
dfn <- dfn[dfn$Run %in% rownames(tab)[rowSums(tab>0)==2],]
stn <- stn[rownames(dfn),]
ftn <- ftn[rownames(dfn),]
dfn$LibrarySize <- rowSums(stn)
ggplot(data=dfn, aes(x=BodySite, y=LibrarySize, color=BodySite)) + geom_jitter()
# Don't want to drop them though, since only doing it for positives. Not appropriate to do so in this context.
summary(rowSums(stn))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 118919 152894 143362 180773 592382
head(sort(rowSums(stn)),30) # Requiring at least 100 reads
## 2007001228 1056501118 1051001308 1056501148 1056501138 EC65G12
## 0 0 1 1 2 6
## 1056501158 1056501358 1056501258 4004901208 1060101168 NTC57
## 9 9 10 11 12 18
## 1056501248 NTC21 EC65E12 NTC64 NTC16 NTC67
## 19 19 23 31 36 37
## EC71G12 1060101228 NTC56 EC71F12 EC64G12 NTC75
## 39 41 47 58 59 60
## EC47E12 NTC60 NTC61 EC41F12 EC44G12 NTC62
## 62 67 74 77 79 82
dfn <- dfn[rowSums(stn)>=100,]
stn <- stn[rownames(stn),]
ftn <- ftn[rownames(ftn),]
#taxn <- assignTaxonomy(stn, "~/tax/silva_nr_v128_train_set.fa.gz", multithread=TRUE)
#saveRDS(taxn, file.path(path.in, "taxn.rds"))
taxn <- readRDS(file.path(path.in, "taxn.rds"))
Add quantitation data:
dfq <- read.csv(file.path(path.in, "DNA_quant_IL2_3_5-9_forBen_20170507.csv"), header=TRUE, stringsAsFactors = FALSE)
dfq <- dfq[match(rownames(dfn), dfq$specimen.barcode),]
identical(dfq$specimen.barcode, rownames(dfn)) # TRUE
## [1] TRUE
dfn$Concentration <- dfq$ng.ul
Make phyloseq object:
psn <- phyloseq(otu_table(ftn, taxa_are_rows=FALSE), sample_data(dfn), tax_table(taxn))
psnprev <- transform_sample_counts(psn, function(x) ifelse(x>0, 1, 0))
psn
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 5481 taxa and 2400 samples ]
## sample_data() Sample Data: [ 2400 samples by 64 sample variables ]
## tax_table() Taxonomy Table: [ 5481 taxa by 6 taxonomic ranks ]
Calculate decontam scores using the frequency, prevalence and combined methods.
fdf <- isContaminant(psn, conc="Concentration", method="frequency", batch="Run", normalize=FALSE)
pdf <- isContaminant(psn, neg="Negative", method="prevalence", batch="Run", normalize=FALSE)
cdf <- isContaminant(psn, conc="Concentration", neg="Negative", method="combined", batch="Run", normalize=FALSE)
contamdf <- rbind(cbind(fdf, Method="Frequency"), cbind(pdf, Method="Prevalence"), cbind(cdf, Method="Combined"))
prev <- rep(taxa_sums(psnprev), times=3)
contamdf$Prevalence <- cut(prev, c(0, 2, 5, 10, 9999), labels=c("2", "3-5", "6-10", "11+"))
Plot histogram of scores.
histo <- ggplot(data=contamdf, aes(x=p, fill=Prevalence)) +
scale_fill_manual(values=prevalencePalette) +
geom_histogram(binwidth=0.01) +
labs(x = 'decontam Score', y='Number ASVs') +
facet_wrap(~Method)
histo
## Warning: Removed 10207 rows containing non-finite values (stat_bin).
ggsave(file.path(path.out, "PTB_Score_Histogram.pdf"), histo, width=7, height=2.6, units="in", useDingbats=FALSE)
## Warning: Removed 10207 rows containing non-finite values (stat_bin).
ggsave(file.path(path.out, "PTB_Score_Histogram.png"), histo, width=7, height=2.6, units="in")
## Warning: Removed 10207 rows containing non-finite values (stat_bin).
The contaminant mode is quite clear in the combined histogram. At the 0.01-0.02 bin it starts to rise above the flat distribution, and then jumps dramatically in the 0.00-0.01 bin. That jump is almost entirely supported by high prevalence taxa for which we are most confident. The combined method also has the cleanest distribution. From this, a clear choice for a classification threshold for the combined method is P* = 0.01 (although the other methods could certainly be used as well).
Double-check threshold choice via a q-q plot.
plot(sort(runif(sum(!is.na(cdf$p)))), sort(cdf$p), log="xy")
abline(0,1,col="red")
Yep, the (strong) deviation occurs at about 0.01.
Let’s look at the consistency between frequency and prevalence scores. This makes sense to do on a run-by-run basis (where raw scores are calculated) but also after choosing the score for overall classification (min by default).
runs <- unique(dfn$Run)
# Frequency
fdf.run <- lapply(runs, function(run) {
ps <- prune_samples(sample_data(psn)$Run == run, psn)
df <- isContaminant(ps, conc="Concentration", method="frequency", normalize=FALSE)
df <- cbind(df, Run = run)
df
})
fdf.run <- do.call(rbind, fdf.run)
# Prevalence
pdf.run <- lapply(runs, function(run) {
ps <- prune_samples(sample_data(psn)$Run == run, psn)
df <- isContaminant(ps, neg="Negative", method="prevalence", normalize=FALSE)
df <- cbind(df, Run = run)
df
})
pdf.run <- do.call(rbind, pdf.run)
Density plot from per-batch scores:
densdf <- data.frame(Sequence=rownames(fdf.run), Frequency=fdf.run$freq, Prevalence=fdf.run$prev, Run=fdf.run$Run,
P.Frequency=fdf.run$p, P.Prevalence=pdf.run$p)
dens <- ggplot(data=densdf, aes(x=P.Frequency, y=P.Prevalence, size=Frequency))
dens <- dens + geom_bin2d() + scale_fill_gradient(low="white", high="black") + theme(panel.grid=element_blank())
dens <- dens + labs(x = 'Score (Frequency)', y='Score (Prevalence)') + guides(size=FALSE)
dens <- dens + theme(aspect.ratio=1)
dens
## Warning: Removed 32876 rows containing non-finite values (stat_bin2d).
Good agreement between the methods. The warning is due to NA scores assigned when ASVs are present in less than two samples in a run.
Density plot from overall scores (minimum across batches):
densdf.tot <- data.frame(Sequence=rownames(fdf), Frequency=fdf$freq, Prevalence=fdf$prev,
P.Frequency=fdf$p, P.Prevalence=pdf$p)
dens.tot <- ggplot(data=densdf.tot, aes(x=P.Frequency, y=P.Prevalence, size=Frequency))
dens.tot <- dens.tot + geom_bin2d() + scale_fill_gradient(low="white", high="black") + theme(panel.grid=element_blank())
dens.tot <- dens.tot + labs(x = 'Score (Frequency)', y='Score (Prevalence)') + guides(size=FALSE)
dens <- dens + theme(aspect.ratio=1)
dens.tot
ggsave(file.path(path.out, "PTB_Score_Density.pdf"), dens.tot, width=4, height=2.8, units="in", useDingbats=FALSE)
ggsave(file.path(path.out, "PTB_Score_Density.png"), dens.tot, width=4, height=2.8, units="in")
Very good agreement between the methods. The concentration at 0.5/0.5 is due to that being the uninformative score, that will be assigned as the minimum for many true ASVs that were minimally present in some batch.
This code was derived from the analysis workflows associated with Callahan & Digiluio et al. 2017.
Make a subject data.frame with the average frequencies for all genera from the processed data used in the manuscript:
load(file.path(path.in, "processed.rda"))
subdf <- df[!duplicated(df$SubjectID),c("SubjectID", "Cohort", "Race", "preterm", "Outcome")]
subdf$Outcome <- factor(subdf$Outcome, levels=c("Preterm","Term"))
for(sq in rownames(tax)) {
subdf[,sq] <- tapply(ft[,sq], df$SubjectID, mean)[subdf$SubjectID]
}
Test the association of the average gestational frequency of each amplicon sequence variant (ASV) with PTB:
sqs <- rownames(tax)
scores.uab <- rep(1.0, nrow(tax))
scores.stan <- rep(1.0, nrow(tax))
names(scores.stan) <- sqs; names(scores.uab) <- sqs
for(sq in rownames(tax)) {
scores.stan[sq] <- suppressWarnings(wilcox.test(subdf[subdf$Cohort %in% "Stanford" & subdf$preterm, sq],
subdf[subdf$Cohort %in% "Stanford" & !subdf$preterm, sq], alternative="greater")$p.value)
scores.uab[sq] <- suppressWarnings(wilcox.test(subdf[subdf$Cohort %in% "UAB" & subdf$preterm, sq],
subdf[subdf$Cohort %in% "UAB" & !subdf$preterm, sq], alternative="greater")$p.value)
}
taxdf <- data.frame(pStanford = scores.stan, pUAB = scores.uab, Genus=tax[, "Genus"])
taxdf$freqStanford <- apply(ft[df$Cohort %in% "Stanford",], 2, mean)
taxdf$freqUAB <- apply(ft[df$Cohort %in% "UAB",], 2, mean)
rownames(taxdf) <- sqs
taxdf$Joint <- pchisq(-2*log(taxdf$pStanford * taxdf$pUAB), df=4, lower.tail=FALSE)
taxdf$Sig <- p.adjust(taxdf$Joint, method="BH") < 0.1
taxdf$Freq <- (taxdf$freqStanford + taxdf$freqUAB)/2
sum(p.adjust(scores.stan, method="BH")<0.1); sum(p.adjust(scores.uab, method="BH")<0.1)
## [1] 43
## [1] 1
Propagate the decontam scores to the exploratory analysis data.frame:
rownames(cdf) <- colnames(stn)
taxdf$P.Contam <- cdf[rownames(taxdf), "p"]
Make exploratory plots (i.e. all variants) at the level of sequence variants, with points colored by their decontam score:
taxdf$P.Contam[is.na(taxdf$P.Contam)] <- 0.5
taxdf$Category <- cut(taxdf$P.Contam, c(0, 1e-5, 0.01, 0.1, 1))
p.sv <- ggplot(data=taxdf, aes(x=pStanford, y=pUAB, color=Category, label=Genus)) +
geom_point() +
geom_text(data=taxdf[taxdf$P.Contam < 1e-6 & (taxdf$pUAB < 0.001 | taxdf$pStanford < 0.001),], color="magenta", vjust=-0.7) +
scale_color_manual(values=c("(0,1e-05]"="magenta", "(1e-05,0.01]"="violetred2", "(0.01,0.1]"="grey", "(0.1,1]"="black"), name="decontam P") +
scale_x_log10() + scale_y_log10() +
theme_bw() + coord_fixed() + xlab("PTB Association P-value (Stanford)") + ylab("PTB Association P-value (UAB)")
p.sv
ggsave(file.path(path.out, "PTB_Fig6_Exploratory_ASVs.pdf"), p.sv, dev="pdf", width=6, height=4, units="in", useDingbats=FALSE)
The manuscript figure needs a bit of Illustrator manipulation of the text labels.
Clearly several of the ASVs significantly associated with PTB in the exploratory analysis were contaminants, as suggested in the original manuscript.
sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 dada2_1.8.0 Rcpp_0.12.17 phyloseq_1.22.3
## [5] reshape2_1.4.3 ggplot2_3.0.0 decontam_1.1.0
##
## loaded via a namespace (and not attached):
## [1] Biobase_2.38.0 jsonlite_1.5
## [3] splines_3.4.3 foreach_1.4.4
## [5] RcppParallel_4.4.0 assertthat_0.2.0
## [7] stats4_3.4.3 latticeExtra_0.6-28
## [9] Rsamtools_1.30.0 GenomeInfoDbData_1.0.0
## [11] yaml_2.1.19 pillar_1.2.3
## [13] backports_1.1.2 lattice_0.20-35
## [15] glue_1.2.0 digest_0.6.15
## [17] RColorBrewer_1.1-2 GenomicRanges_1.30.3
## [19] XVector_0.18.0 colorspace_1.3-2
## [21] htmltools_0.3.6 Matrix_1.2-14
## [23] plyr_1.8.4 pkgconfig_2.0.1
## [25] ShortRead_1.36.1 zlibbioc_1.24.0
## [27] purrr_0.2.5 scales_0.5.0
## [29] BiocParallel_1.12.0 tibble_1.4.2
## [31] mgcv_1.8-23 IRanges_2.12.0
## [33] withr_2.1.2 SummarizedExperiment_1.8.1
## [35] BiocGenerics_0.24.0 lazyeval_0.2.1
## [37] survival_2.42-3 magrittr_1.5
## [39] evaluate_0.10.1 nlme_3.1-137
## [41] MASS_7.3-50 hwriter_1.3.2
## [43] vegan_2.5-2 tools_3.4.3
## [45] data.table_1.11.4 matrixStats_0.53.1
## [47] stringr_1.3.1 S4Vectors_0.16.0
## [49] munsell_0.4.3 cluster_2.0.7-1
## [51] DelayedArray_0.4.1 bindrcpp_0.2.2
## [53] Biostrings_2.46.0 ade4_1.7-11
## [55] compiler_3.4.3 GenomeInfoDb_1.14.0
## [57] rlang_0.2.1 rhdf5_2.22.0
## [59] grid_3.4.3 RCurl_1.95-4.10
## [61] iterators_1.0.9 biomformat_1.6.0
## [63] igraph_1.2.1 labeling_0.3
## [65] bitops_1.0-6 rmarkdown_1.9
## [67] gtable_0.2.0 codetools_0.2-15
## [69] multtest_2.34.0 R6_2.2.2
## [71] GenomicAlignments_1.14.2 knitr_1.20
## [73] dplyr_0.7.5 bindr_0.1.1
## [75] rprojroot_1.3-2 permute_0.9-4
## [77] ape_5.1 stringi_1.2.2
## [79] parallel_3.4.3 tidyselect_0.2.4