Import and Harmonize Data

Load packages.

library(biomformat); packageVersion("biomformat")
## [1] '1.18.0'
library(dada2); packageVersion("dada2")
## Loading required package: Rcpp
## [1] '1.19.2'
library(decontam); packageVersion("decontam")
## [1] '1.11.1'
library(ggplot2); packageVersion("ggplot2")
## [1] '3.3.5'

Read in the microbiome profile data.

fn.biom <- "../Data/feature-table.biom" # CHANGE ME IF working directory is different than Rmd location
biom.hash <- read_biom(fn.biom) 
## Warning in strsplit(conditionMessage(e), "\n"): input string 1 is invalid in
## this locale
st.hash <- t(as(biom_data(biom.hash), "matrix")) # Now, rows are samples and columns are ASVs

This biom file was generated by QIIME2 running the q2-dada2 plugin, and the ASVs are stored as hashes rather than as the sequences themselves. Using the representative sequence fasta to translate the hashes back to DNA sequences.

fn.seqs <- "../Data/dna-sequences.fasta" # CHANGE ME IF working directory is different than Rmd location
sq <- getSequences(fn.seqs)
if(!identical(colnames(st.hash), names(sq))) stop("Mismatch between hashed ASV IDs.")
st <- st.hash
colnames(st) <- sq # Sequence table now "DADA2 format": Rows=samples, Columns=ASVs, corresponding row/colnames

Read in the sample metadata.

fn.meta <- "../Data/bile_map.csv" # CHANGE ME IF working directory is different than Rmd location
df <- read.csv(fn.meta, header=TRUE)
rownames(df) <- df$SampleID

Check for consistency between sequence table and metadata table.

identical(sort(rownames(df)), sort(rownames(st))) # FALSE
## [1] FALSE
rownames(df)[!rownames(df) %in% rownames(st)]
## character(0)
rownames(st)[!rownames(st) %in% rownames(df)]
## [1] "726-518" "727-520"

Samples “726-518” and “727-520” are in the sequence table, but not in the metadata table.

Coordinating the sequence table and metadata table (and removing samples without metadata).

sam <- rownames(df)
df <- df[sam,]
st <- st[sam,]
if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")

A brief glance at the data we are working with.

dim(df)
## [1] 77 93
dim(st)
## [1]    77 11069

77 samples. 93 metadata columns. 10,000+ ASVs.

Curate Data

Inspect metadata columns.

colnames(df)
##  [1] "FacilityID"                                       
##  [2] "DNA_Conc"                                         
##  [3] "Library_Conc"                                     
##  [4] "SampleID"                                         
##  [5] "i7Indexes"                                        
##  [6] "i5Indexes"                                        
##  [7] "SampleType"                                       
##  [8] "Filler"                                           
##  [9] "CaseNumber"                                       
## [10] "MatchedSample"                                    
## [11] "Bactibilia"                                       
## [12] "Organisms"                                        
## [13] "BacteriaPLUS.BacteriaNEG"                         
## [14] "LifeStage"                                        
## [15] "Sex"                                              
## [16] "Breed"                                            
## [17] "Patient.Control"                                  
## [18] "FebrileLESSTHANEQUAL103HyperthermicorNormothermic"
## [19] "PCV"                                              
## [20] "PP"                                               
## [21] "PLT"                                              
## [22] "WBC"                                              
## [23] "SEG"                                              
## [24] "BAND"                                             
## [25] "TOTALLYMPH"                                       
## [26] "MONO"                                             
## [27] "EOS"                                              
## [28] "BUN"                                              
## [29] "CREAT"                                            
## [30] "ALT"                                              
## [31] "ALP"                                              
## [32] "GGT"                                              
## [33] "AST"                                              
## [34] "CK"                                               
## [35] "TBILI"                                            
## [36] "ALB"                                              
## [37] "GLOB"                                             
## [38] "CHOL"                                             
## [39] "GLU"                                              
## [40] "ClinicalDiagnosisofCholangitisvsOther"            
## [41] "Inflammatorycellsoncytology"                      
## [42] "CulturePLUSmultiple.single.none"                  
## [43] "GIPanelPerformed"                                 
## [44] "fPLI"                                             
## [45] "Cobalamin.Folate"                                 
## [46] "Antibioticusagecurrentorwithinpast30d"            
## [47] "GBabnormalitiesPLUSorNEG"                         
## [48] "Wallthickening"                                   
## [49] "Wallhyperechogenicity"                            
## [50] "GBSludge"                                         
## [51] "CBDDilation"                                      
## [52] "CBDWallThickening"                                
## [53] "Choleliths"                                       
## [54] "Mucosalhyperplasia"                               
## [55] "PancreasabnormalitiesPLUSorNEG"                   
## [56] "Hypoechoicparenchyma"                             
## [57] "Heterogeneousparenchyma"                          
## [58] "Hyperechoicmesentery"                             
## [59] "Enlarged"                                         
## [60] "Nodules"                                          
## [61] "IntestinalabnormalitiesPLUSorNEG"                 
## [62] "Lossofwalllayering"                               
## [63] "Thickenedlayers"                                  
## [64] "Abnormallayer"                                    
## [65] "Dilation"                                         
## [66] "LiverabnormalitiesPLUSorNEG"                      
## [67] "Hypoechoic"                                       
## [68] "Hyperechoic"                                      
## [69] "Gallbladder.Liverabnormalities"                   
## [70] "LymphnodeabnormalitiesYorN"                       
## [71] "Abnormallymphnodes"                               
## [72] "Intestinal.Nodalabnormalities"                    
## [73] "PeritonealEffusion"                               
## [74] "ElevatedLEActivityALT.ALP.both"                   
## [75] "Cholestaticvs.Hepatocellularvs.Mixed"             
## [76] "InflammationPLUS.InflammationNEG"                 
## [77] "Inflamm.Bacteria"                                 
## [78] "ELE.Bacteria"                                     
## [79] "GBAUS.Bacteria"                                   
## [80] "Inflamm.ELE.Bacteria"                             
## [81] "Inflamm.GBAUS.Bacteria"                           
## [82] "ELE.GBAUS.Bacteria"                               
## [83] "Inflamm.ELE.GBAUS.Bacteria"                       
## [84] "Inflamm.ELE"                                      
## [85] "Inflamm.GBAUS"                                    
## [86] "ELE.GBAUS"                                        
## [87] "Inflamm.ELE.GBAUS"                                
## [88] "InflammPLUSPancAUS"                               
## [89] "fPLiPLUSPancAUS"                                  
## [90] "InflammPLUSfPLiPLUSPancAUS"                       
## [91] "LowB12.FolatePLUSITAUS"                           
## [92] "LowB12.FolatePLUSITAUSPLUSNodeAUS"                
## [93] "HistopathMDPNOducts"

Begining with the columns that we will use to try to distinguish between contaminants and real sample sequences:

table(df$SampleType, useNA="ifany")
## 
## Negative_Isolation   Negative_Library  Negative_Sampling           Positive 
##                  3                  1                 13                 60

16 total negative controls, most of which are (13/16) sampling negatives (blank sampling instruments that were then carried all the way through the amplicon sequencing process).

Define a generic column with TRUE for all negative controls, and FALSE for real (positive) samples.

df$NegativeControl <- grepl("Negative", df$SampleType)
table(df$SampleType, df$NegativeControl)
##                     
##                      FALSE TRUE
##   Negative_Isolation     0    3
##   Negative_Library       0    1
##   Negative_Sampling      0   13
##   Positive              60    0

Inspect the DNA concentrations.

class(df$DNA_Conc)
## [1] "character"
table(df$DNA_Conc, useNA="ifany")
## 
## 0.002627016 0.005254032 0.010508065 0.015762097 0.018389114 0.052540325 
##           8           2           2           1           1           1 
##  0.06304839 0.091945568 0.191772185 0.320495981 0.323122997 103.8275627 
##           1           1           1           1           1           1 
## 104.4606736 104.8205748 105.8503652 106.2785688 106.6805023 67.87684548 
##           1           1           1           1           1           1 
## 95.65491515          ND        <NA> 
##           1          48           1

$DNA_Conc is mostly ND (i.e. below limit of detection), and is currently in character format since its a mix of numbers and ND strings.

Define a new numeric $conc.dna column with the NDs replaced by zeroes.

df$conc.dna <- df$DNA_Conc
df$conc.dna[df$conc.dna == "ND"] <- 0
df$conc.dna <- as.numeric(df$conc.dna)
summary(df$conc.dna)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##   0.00000   0.00000   0.00000  10.48131   0.00525 106.68050         1
table(df$SampleType, df$conc.dna>0, useNA="ifany")
##                     
##                      FALSE TRUE <NA>
##   Negative_Isolation     3    0    0
##   Negative_Library       0    0    1
##   Negative_Sampling     12    1    0
##   Positive              33   27    0

DNA above the limit of detection occurs at a higher rate in the Positive (real) samples, which is good, and the one NA concentration corresponds with the negative library sample (as it should).

class(df$Library_Conc)
## [1] "numeric"
summary(df$Library_Conc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8679  3.4657  5.8902 11.1971 12.2150 64.5917

Quite different. Every sample has a detectable concentration after libraries are normalized to try to produce equimolar amounts of DNA.

ggplot(data=df, aes(x=conc.dna, y=Library_Conc, color=SampleType)) + 
  geom_point() + 
#  scale_x_log10() + scale_y_log10() +
  xlab("DNA Concentration (sample)") + ylab("DNA Concentration (library)")
## Warning: Removed 1 rows containing missing values (geom_point).

In this sample set, where DNA concentrations of so many samples are initially below the limit of detection, the association between DNA concentration before and after library construction is fairly weak.

In sum, because such a large fraction of the pre-normalization concentrations are ND, and that is the more informative value for using the “frequency” method to detect contaminants, there is limited utility for frequency-based contaminant identification here.

Curate samples

Check the distribution of library sizes (read depths) for samples:

if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")
df$LibrarySize <- rowSums(st[rownames(df),])
ggplot(data=df, aes(x=rank(LibrarySize), y=LibrarySize, color=SampleType)) + geom_point()

sort(df$LibrarySize)[1:10]
##  [1]    69    76   131  6200 14624 16588 17454 19347 19465 22015

Three very low depth samples (<200 reads). The rest all have >5000 reads.

Remove the low-depth samples.

if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")
sam <- rownames(df)[df$LibrarySize > 500]
df <- df[sam,]
st <- st[sam,]
table(df$SampleType, df$NegativeControl)
##                     
##                      FALSE TRUE
##   Negative_Isolation     0    3
##   Negative_Library       0    1
##   Negative_Sampling      0   13
##   Positive              57    0

Curate ASVs

Use the distribution of ASV prevalences and abundances to guide feature-wise pre-filtering.

Inspect the distribution of ASV prevalences.

table(colSums(st>0))
## 
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  465 9007  376  185  130   99   89   64   37   41   36   32   32   28   33   16 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##   26   17   18   18   15    9   12   15    9   15   10    7    4   15    7    6 
##   32   33   34   35   36   37   38   39   40   41   42   43   44   45   46   47 
##   13    5   10    3    9    6    5    3   10    4    2    6    7    4    3    7 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63 
##    3    4    1    5    5    9    3    4    4    6    2    4    5    2    2    8 
##   64   65   66   67   68   69   70   71   72   73 
##    3    8    4    3    3    3    3    1    2    2

A very large number of ASVs that appear in only one or fewer samples.

Create a data.frame with rows for each ASV, and columns for prevalence and abundance across all samples, and only in positive samples.

df.asv <- data.frame(Sequence=colnames(st), 
                     Prevalence=colSums(st>0),
                     Prevalence.Pos=colSums(st[!df$NegativeControl,]>0),
                     Abundance=colSums(st),
                     Abundance.Pos=colSums(st[!df$NegativeControl,]))

Inspect joint distribution of prevalence and abundance.

ggplot(data=df.asv, aes(x=Prevalence, y=Abundance)) + 
  geom_point() + 
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

Inspect joint distribution of prevalence and abundance (again), but this time in only the positive samples.

ggplot(data=df.asv, aes(x=Prevalence.Pos, y=Abundance.Pos)) + geom_point() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

There are a small number of higher abundance but low prevalence ASVs, even when considering just the positive samples.

Choice of filtering threshold: Presence in at least 5 positive samples OR at least 1000 total reads in positive samples (to be permissive to potential taxa that might rarely, but truly, occur in real samples). That’s a 5/57 ~ 9% prevalence threshold.

st.filt <- st[,df.asv$Prevalence.Pos >= 5 | df.asv$Abundance.Pos >= 1000]
ncol(st.filt); ncol(st); ncol(st.filt)/ncol(st)
## [1] 749
## [1] 11069
## [1] 0.06766646
sum(st.filt)/sum(st)
## [1] 0.9172798

This filtering reduced the number of ASVs from ~11k to ~750, but kept >90% of total reads.

Assign taxonomies using Silva v138.1 for these ASVs.

### Uncomment the next two lines to regenerate the taxonomy table anew
### tax.filt <- assignTaxonomy(st.filt, "~/tax/silva_nr99_v138.1_train_set.fa.gz", multi=TRUE)
### saveRDS(tax.filt, "../RDS/tax_filt.rds")
tax.filt <- readRDS("../RDS/tax_filt.rds")
if(!identical(getSequences(tax.filt), getSequences(st.filt))) stop("Table mismatch: tax.filt, st.filt")
table(tax.filt[,"Phylum"], useNA="ifany")
## 
##  Actinobacteriota      Bacteroidota  Campylobacterota     Cyanobacteria 
##                52               172                 4                 6 
##  Deferribacterota      Deinococcota  Desulfobacterota        Firmicutes 
##                 2                 1                12               406 
##    Fusobacteriota Methylomirabilota   Patescibacteria    Proteobacteria 
##                 1                 1                11                61 
## Verrucomicrobiota              <NA> 
##                 1                19

Phylum results look roughly in accordance with what would be expected from instestinal bacteria.

tapply(colSums(st.filt), tax.filt[,"Phylum"], sum)/sum(st.filt)
##  Actinobacteriota      Bacteroidota  Campylobacterota     Cyanobacteria 
##      1.074908e-01      2.417620e-01      1.368817e-02      1.316098e-03 
##  Deferribacterota      Deinococcota  Desulfobacterota        Firmicutes 
##      1.162745e-02      4.446659e-05      9.959950e-03      4.777407e-01 
##    Fusobacteriota Methylomirabilota   Patescibacteria    Proteobacteria 
##      5.916317e-05      7.800494e-05      7.776942e-03      1.173566e-01 
## Verrucomicrobiota 
##      8.174315e-03

Also at the read level: Bacteroidota and Firmicutes dominate the total abunbdance, with a good number of Actinobacteriota and Proteos as well (~10% of each). All others at 1\% or much less.

Save the curated data in R format.

if(!identical(getSequences(tax.filt), getSequences(st.filt))) stop("Table mismatch: tax.filt, st.filt")
if(!identical(rownames(df), rownames(st))) stop("Mismatched samples in df and st.")
if(!identical(rownames(df), rownames(st.filt))) stop("Mismatched samples in df and st.")
save(st, st.filt, tax.filt, df, file="../RDS/bile.rda")
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.5     decontam_1.11.1   dada2_1.19.2      Rcpp_1.0.7       
## [5] biomformat_1.18.0
## 
## loaded via a namespace (and not attached):
##  [1] MatrixGenerics_1.2.1        Biobase_2.50.0             
##  [3] sass_0.4.0                  jsonlite_1.7.2             
##  [5] bslib_0.3.1                 RcppParallel_5.1.4         
##  [7] assertthat_0.2.1            highr_0.9                  
##  [9] stats4_4.0.3                latticeExtra_0.6-29        
## [11] GenomeInfoDbData_1.2.4      Rsamtools_2.6.0            
## [13] yaml_2.2.1                  pillar_1.6.3               
## [15] lattice_0.20-45             glue_1.4.2                 
## [17] digest_0.6.28               GenomicRanges_1.42.0       
## [19] RColorBrewer_1.1-2          XVector_0.30.0             
## [21] colorspace_2.0-2            htmltools_0.5.2            
## [23] Matrix_1.3-4                plyr_1.8.6                 
## [25] pkgconfig_2.0.3             ShortRead_1.48.0           
## [27] zlibbioc_1.36.0             purrr_0.3.4                
## [29] scales_1.1.1                jpeg_0.1-9                 
## [31] BiocParallel_1.24.1         tibble_3.1.5               
## [33] farver_2.1.0                generics_0.1.0             
## [35] IRanges_2.24.1              ellipsis_0.3.2             
## [37] withr_2.4.2                 SummarizedExperiment_1.20.0
## [39] BiocGenerics_0.36.1         magrittr_2.0.1             
## [41] crayon_1.4.1                evaluate_0.14              
## [43] fansi_0.5.0                 hwriter_1.3.2              
## [45] tools_4.0.3                 lifecycle_1.0.1            
## [47] matrixStats_0.61.0          stringr_1.4.0              
## [49] Rhdf5lib_1.12.1             S4Vectors_0.28.1           
## [51] munsell_0.5.0               DelayedArray_0.16.3        
## [53] Biostrings_2.58.0           compiler_4.0.3             
## [55] jquerylib_0.1.4             GenomeInfoDb_1.26.7        
## [57] rlang_0.4.11                rhdf5_2.34.0               
## [59] grid_4.0.3                  RCurl_1.98-1.5             
## [61] rhdf5filters_1.2.1          bitops_1.0-7               
## [63] labeling_0.4.2              rmarkdown_2.11             
## [65] gtable_0.3.0                DBI_1.1.1                  
## [67] reshape2_1.4.4              R6_2.5.1                   
## [69] GenomicAlignments_1.26.0    knitr_1.36                 
## [71] dplyr_1.0.5                 fastmap_1.1.0              
## [73] utf8_1.2.2                  stringi_1.7.5              
## [75] parallel_4.0.3              vctrs_0.3.8                
## [77] png_0.1-7                   tidyselect_1.1.1           
## [79] xfun_0.26