Load packages.
library(biomformat); packageVersion("biomformat")
## [1] '1.18.0'
library(dada2); packageVersion("dada2")
## Loading required package: Rcpp
## [1] '1.19.2'
library(decontam); packageVersion("decontam")
## [1] '1.11.1'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.3.5'
Read in the microbiome profile data.
fn.biom <- "../Data/feature-table.biom" # CHANGE ME IF working directory is different than Rmd location
biom.hash <- read_biom(fn.biom)
## Warning in strsplit(conditionMessage(e), "\n"): input string 1 is invalid in
## this locale
st.hash <- t(as(biom_data(biom.hash), "matrix")) # Now, rows are samples and columns are ASVs
This biom file was generated by QIIME2 running the q2-dada2 plugin, and the ASVs are stored as hashes rather than as the sequences themselves. Using the representative sequence fasta to translate the hashes back to DNA sequences.
fn.seqs <- "../Data/dna-sequences.fasta" # CHANGE ME IF working directory is different than Rmd location
sq <- getSequences(fn.seqs)
if(!identical(colnames(st.hash), names(sq))) stop("Mismatch between hashed ASV IDs.")
st <- st.hash
colnames(st) <- sq # Sequence table now "DADA2 format": Rows=samples, Columns=ASVs, corresponding row/colnames
Read in the sample metadata.
fn.meta <- "../Data/bile_map.csv" # CHANGE ME IF working directory is different than Rmd location
df <- read.csv(fn.meta, header=TRUE)
rownames(df) <- df$SampleID
Check for consistency between sequence table and metadata table.
identical(sort(rownames(df)), sort(rownames(st))) # FALSE
## [1] FALSE
rownames(df)[!rownames(df) %in% rownames(st)]
## character(0)
rownames(st)[!rownames(st) %in% rownames(df)]
## [1] "726-518" "727-520"
Samples “726-518” and “727-520” are in the sequence table, but not in the metadata table.
Coordinating the sequence table and metadata table (and removing samples without metadata).
sam <- rownames(df)
df <- df[sam,]
st <- st[sam,]
if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")
A brief glance at the data we are working with.
dim(df)
## [1] 77 93
dim(st)
## [1] 77 11069
77 samples. 93 metadata columns. 10,000+ ASVs.
Inspect metadata columns.
colnames(df)
## [1] "FacilityID"
## [2] "DNA_Conc"
## [3] "Library_Conc"
## [4] "SampleID"
## [5] "i7Indexes"
## [6] "i5Indexes"
## [7] "SampleType"
## [8] "Filler"
## [9] "CaseNumber"
## [10] "MatchedSample"
## [11] "Bactibilia"
## [12] "Organisms"
## [13] "BacteriaPLUS.BacteriaNEG"
## [14] "LifeStage"
## [15] "Sex"
## [16] "Breed"
## [17] "Patient.Control"
## [18] "FebrileLESSTHANEQUAL103HyperthermicorNormothermic"
## [19] "PCV"
## [20] "PP"
## [21] "PLT"
## [22] "WBC"
## [23] "SEG"
## [24] "BAND"
## [25] "TOTALLYMPH"
## [26] "MONO"
## [27] "EOS"
## [28] "BUN"
## [29] "CREAT"
## [30] "ALT"
## [31] "ALP"
## [32] "GGT"
## [33] "AST"
## [34] "CK"
## [35] "TBILI"
## [36] "ALB"
## [37] "GLOB"
## [38] "CHOL"
## [39] "GLU"
## [40] "ClinicalDiagnosisofCholangitisvsOther"
## [41] "Inflammatorycellsoncytology"
## [42] "CulturePLUSmultiple.single.none"
## [43] "GIPanelPerformed"
## [44] "fPLI"
## [45] "Cobalamin.Folate"
## [46] "Antibioticusagecurrentorwithinpast30d"
## [47] "GBabnormalitiesPLUSorNEG"
## [48] "Wallthickening"
## [49] "Wallhyperechogenicity"
## [50] "GBSludge"
## [51] "CBDDilation"
## [52] "CBDWallThickening"
## [53] "Choleliths"
## [54] "Mucosalhyperplasia"
## [55] "PancreasabnormalitiesPLUSorNEG"
## [56] "Hypoechoicparenchyma"
## [57] "Heterogeneousparenchyma"
## [58] "Hyperechoicmesentery"
## [59] "Enlarged"
## [60] "Nodules"
## [61] "IntestinalabnormalitiesPLUSorNEG"
## [62] "Lossofwalllayering"
## [63] "Thickenedlayers"
## [64] "Abnormallayer"
## [65] "Dilation"
## [66] "LiverabnormalitiesPLUSorNEG"
## [67] "Hypoechoic"
## [68] "Hyperechoic"
## [69] "Gallbladder.Liverabnormalities"
## [70] "LymphnodeabnormalitiesYorN"
## [71] "Abnormallymphnodes"
## [72] "Intestinal.Nodalabnormalities"
## [73] "PeritonealEffusion"
## [74] "ElevatedLEActivityALT.ALP.both"
## [75] "Cholestaticvs.Hepatocellularvs.Mixed"
## [76] "InflammationPLUS.InflammationNEG"
## [77] "Inflamm.Bacteria"
## [78] "ELE.Bacteria"
## [79] "GBAUS.Bacteria"
## [80] "Inflamm.ELE.Bacteria"
## [81] "Inflamm.GBAUS.Bacteria"
## [82] "ELE.GBAUS.Bacteria"
## [83] "Inflamm.ELE.GBAUS.Bacteria"
## [84] "Inflamm.ELE"
## [85] "Inflamm.GBAUS"
## [86] "ELE.GBAUS"
## [87] "Inflamm.ELE.GBAUS"
## [88] "InflammPLUSPancAUS"
## [89] "fPLiPLUSPancAUS"
## [90] "InflammPLUSfPLiPLUSPancAUS"
## [91] "LowB12.FolatePLUSITAUS"
## [92] "LowB12.FolatePLUSITAUSPLUSNodeAUS"
## [93] "HistopathMDPNOducts"
Begining with the columns that we will use to try to distinguish between contaminants and real sample sequences:
$DNA_Conc
: The DNA concentration prior to library normalization. ND is below limit of detection$Library_Conc
: DNA concentration after library normalization.$SampleType
: Positive, or NegativeSampling/NegativeIsolation/NegativeLibrary, all of which are negative samples of different typestable(df$SampleType, useNA="ifany")
##
## Negative_Isolation Negative_Library Negative_Sampling Positive
## 3 1 13 60
16 total negative controls, most of which are (13/16) sampling negatives (blank sampling instruments that were then carried all the way through the amplicon sequencing process).
Define a generic column with TRUE for all negative controls, and FALSE for real (positive) samples.
df$NegativeControl <- grepl("Negative", df$SampleType)
table(df$SampleType, df$NegativeControl)
##
## FALSE TRUE
## Negative_Isolation 0 3
## Negative_Library 0 1
## Negative_Sampling 0 13
## Positive 60 0
Inspect the DNA concentrations.
class(df$DNA_Conc)
## [1] "character"
table(df$DNA_Conc, useNA="ifany")
##
## 0.002627016 0.005254032 0.010508065 0.015762097 0.018389114 0.052540325
## 8 2 2 1 1 1
## 0.06304839 0.091945568 0.191772185 0.320495981 0.323122997 103.8275627
## 1 1 1 1 1 1
## 104.4606736 104.8205748 105.8503652 106.2785688 106.6805023 67.87684548
## 1 1 1 1 1 1
## 95.65491515 ND <NA>
## 1 48 1
$DNA_Conc
is mostly ND (i.e. below limit of detection), and is currently in character format since its a mix of numbers and ND strings.
Define a new numeric $conc.dna
column with the NDs replaced by zeroes.
df$conc.dna <- df$DNA_Conc
df$conc.dna[df$conc.dna == "ND"] <- 0
df$conc.dna <- as.numeric(df$conc.dna)
summary(df$conc.dna)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00000 0.00000 0.00000 10.48131 0.00525 106.68050 1
table(df$SampleType, df$conc.dna>0, useNA="ifany")
##
## FALSE TRUE <NA>
## Negative_Isolation 3 0 0
## Negative_Library 0 0 1
## Negative_Sampling 12 1 0
## Positive 33 27 0
DNA above the limit of detection occurs at a higher rate in the Positive (real) samples, which is good, and the one NA concentration corresponds with the negative library sample (as it should).
class(df$Library_Conc)
## [1] "numeric"
summary(df$Library_Conc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8679 3.4657 5.8902 11.1971 12.2150 64.5917
Quite different. Every sample has a detectable concentration after libraries are normalized to try to produce equimolar amounts of DNA.
ggplot(data=df, aes(x=conc.dna, y=Library_Conc, color=SampleType)) +
geom_point() +
# scale_x_log10() + scale_y_log10() +
xlab("DNA Concentration (sample)") + ylab("DNA Concentration (library)")
## Warning: Removed 1 rows containing missing values (geom_point).
In this sample set, where DNA concentrations of so many samples are initially below the limit of detection, the association between DNA concentration before and after library construction is fairly weak.
In sum, because such a large fraction of the pre-normalization concentrations are ND, and that is the more informative value for using the “frequency” method to detect contaminants, there is limited utility for frequency-based contaminant identification here.
Check the distribution of library sizes (read depths) for samples:
if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")
df$LibrarySize <- rowSums(st[rownames(df),])
ggplot(data=df, aes(x=rank(LibrarySize), y=LibrarySize, color=SampleType)) + geom_point()
sort(df$LibrarySize)[1:10]
## [1] 69 76 131 6200 14624 16588 17454 19347 19465 22015
Three very low depth samples (<200 reads). The rest all have >5000 reads.
Remove the low-depth samples.
if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")
sam <- rownames(df)[df$LibrarySize > 500]
df <- df[sam,]
st <- st[sam,]
table(df$SampleType, df$NegativeControl)
##
## FALSE TRUE
## Negative_Isolation 0 3
## Negative_Library 0 1
## Negative_Sampling 0 13
## Positive 57 0
Use the distribution of ASV prevalences and abundances to guide feature-wise pre-filtering.
Inspect the distribution of ASV prevalences.
table(colSums(st>0))
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 465 9007 376 185 130 99 89 64 37 41 36 32 32 28 33 16
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
## 26 17 18 18 15 9 12 15 9 15 10 7 4 15 7 6
## 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 13 5 10 3 9 6 5 3 10 4 2 6 7 4 3 7
## 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
## 3 4 1 5 5 9 3 4 4 6 2 4 5 2 2 8
## 64 65 66 67 68 69 70 71 72 73
## 3 8 4 3 3 3 3 1 2 2
A very large number of ASVs that appear in only one or fewer samples.
Create a data.frame with rows for each ASV, and columns for prevalence and abundance across all samples, and only in positive samples.
df.asv <- data.frame(Sequence=colnames(st),
Prevalence=colSums(st>0),
Prevalence.Pos=colSums(st[!df$NegativeControl,]>0),
Abundance=colSums(st),
Abundance.Pos=colSums(st[!df$NegativeControl,]))
Inspect joint distribution of prevalence and abundance.
ggplot(data=df.asv, aes(x=Prevalence, y=Abundance)) +
geom_point() +
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
Inspect joint distribution of prevalence and abundance (again), but this time in only the positive samples.
ggplot(data=df.asv, aes(x=Prevalence.Pos, y=Abundance.Pos)) + geom_point() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
There are a small number of higher abundance but low prevalence ASVs, even when considering just the positive samples.
Choice of filtering threshold: Presence in at least 5 positive samples OR at least 1000 total reads in positive samples (to be permissive to potential taxa that might rarely, but truly, occur in real samples). That’s a 5/57 ~ 9% prevalence threshold.
st.filt <- st[,df.asv$Prevalence.Pos >= 5 | df.asv$Abundance.Pos >= 1000]
ncol(st.filt); ncol(st); ncol(st.filt)/ncol(st)
## [1] 749
## [1] 11069
## [1] 0.06766646
sum(st.filt)/sum(st)
## [1] 0.9172798
This filtering reduced the number of ASVs from ~11k to ~750, but kept >90% of total reads.
Assign taxonomies using Silva v138.1 for these ASVs.
### Uncomment the next two lines to regenerate the taxonomy table anew
### tax.filt <- assignTaxonomy(st.filt, "~/tax/silva_nr99_v138.1_train_set.fa.gz", multi=TRUE)
### saveRDS(tax.filt, "../RDS/tax_filt.rds")
tax.filt <- readRDS("../RDS/tax_filt.rds")
if(!identical(getSequences(tax.filt), getSequences(st.filt))) stop("Table mismatch: tax.filt, st.filt")
table(tax.filt[,"Phylum"], useNA="ifany")
##
## Actinobacteriota Bacteroidota Campylobacterota Cyanobacteria
## 52 172 4 6
## Deferribacterota Deinococcota Desulfobacterota Firmicutes
## 2 1 12 406
## Fusobacteriota Methylomirabilota Patescibacteria Proteobacteria
## 1 1 11 61
## Verrucomicrobiota <NA>
## 1 19
Phylum results look roughly in accordance with what would be expected from instestinal bacteria.
tapply(colSums(st.filt), tax.filt[,"Phylum"], sum)/sum(st.filt)
## Actinobacteriota Bacteroidota Campylobacterota Cyanobacteria
## 1.074908e-01 2.417620e-01 1.368817e-02 1.316098e-03
## Deferribacterota Deinococcota Desulfobacterota Firmicutes
## 1.162745e-02 4.446659e-05 9.959950e-03 4.777407e-01
## Fusobacteriota Methylomirabilota Patescibacteria Proteobacteria
## 5.916317e-05 7.800494e-05 7.776942e-03 1.173566e-01
## Verrucomicrobiota
## 8.174315e-03
Also at the read level: Bacteroidota and Firmicutes dominate the total abunbdance, with a good number of Actinobacteriota and Proteos as well (~10% of each). All others at 1\%
or much less.
Save the curated data in R format.
if(!identical(getSequences(tax.filt), getSequences(st.filt))) stop("Table mismatch: tax.filt, st.filt")
if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")
if(!identical(rownames(df), rownames(st.filt))) stop("Mismatched samples in df and st.")
save(st, st.filt, tax.filt, df, file="../RDS/bile.rda")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.5 decontam_1.11.1 dada2_1.19.2 Rcpp_1.0.7
## [5] biomformat_1.18.0
##
## loaded via a namespace (and not attached):
## [1] MatrixGenerics_1.2.1 Biobase_2.50.0
## [3] sass_0.4.0 jsonlite_1.7.2
## [5] bslib_0.3.1 RcppParallel_5.1.4
## [7] assertthat_0.2.1 highr_0.9
## [9] stats4_4.0.3 latticeExtra_0.6-29
## [11] GenomeInfoDbData_1.2.4 Rsamtools_2.6.0
## [13] yaml_2.2.1 pillar_1.6.3
## [15] lattice_0.20-45 glue_1.4.2
## [17] digest_0.6.28 GenomicRanges_1.42.0
## [19] RColorBrewer_1.1-2 XVector_0.30.0
## [21] colorspace_2.0-2 htmltools_0.5.2
## [23] Matrix_1.3-4 plyr_1.8.6
## [25] pkgconfig_2.0.3 ShortRead_1.48.0
## [27] zlibbioc_1.36.0 purrr_0.3.4
## [29] scales_1.1.1 jpeg_0.1-9
## [31] BiocParallel_1.24.1 tibble_3.1.5
## [33] farver_2.1.0 generics_0.1.0
## [35] IRanges_2.24.1 ellipsis_0.3.2
## [37] withr_2.4.2 SummarizedExperiment_1.20.0
## [39] BiocGenerics_0.36.1 magrittr_2.0.1
## [41] crayon_1.4.1 evaluate_0.14
## [43] fansi_0.5.0 hwriter_1.3.2
## [45] tools_4.0.3 lifecycle_1.0.1
## [47] matrixStats_0.61.0 stringr_1.4.0
## [49] Rhdf5lib_1.12.1 S4Vectors_0.28.1
## [51] munsell_0.5.0 DelayedArray_0.16.3
## [53] Biostrings_2.58.0 compiler_4.0.3
## [55] jquerylib_0.1.4 GenomeInfoDb_1.26.7
## [57] rlang_0.4.11 rhdf5_2.34.0
## [59] grid_4.0.3 RCurl_1.98-1.5
## [61] rhdf5filters_1.2.1 bitops_1.0-7
## [63] labeling_0.4.2 rmarkdown_2.11
## [65] gtable_0.3.0 DBI_1.1.1
## [67] reshape2_1.4.4 R6_2.5.1
## [69] GenomicAlignments_1.26.0 knitr_1.36
## [71] dplyr_1.0.5 fastmap_1.1.0
## [73] utf8_1.2.2 stringi_1.7.5
## [75] parallel_4.0.3 vctrs_0.3.8
## [77] png_0.1-7 tidyselect_1.1.1
## [79] xfun_0.26