Setup

library(dplyr)
library(tidyr)  
library(ggplot2)
library(tibble)
library(dada2)
library(phyloseq)
library(Biostrings)
# Input files
path_reads    <- "/qib/platforms/Informatics/transfer/outgoing/training/metabarcoding-cucumber-reads-small/"
path_silva    <- "/qib/platforms/Informatics/transfer/outgoing/training/metabarcoding-db/silva_nr99_v138.2_toGenus_trainset.fa.gz"


# Output files
path_filtered <- paste0(base, "filtered")
path_precomp  <- paste0(base, "precomputed")

dir.create(path_filtered, showWarnings = FALSE, recursive = TRUE)
dir.create(path_precomp,  showWarnings = FALSE, recursive = TRUE)
cat(path_reads)
## /qib/platforms/Informatics/transfer/outgoing/training/metabarcoding-cucumber-reads-small/

Quality assessment

Reads are primer-trimmed (cutadapt, primers: F CCTACGGGNGGCWGCAG, R GGACTACHVGGGTATCTAATCC).

fnFs <- sort(list.files(path_reads, pattern = "_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path_reads, pattern = "_R2.fastq.gz", full.names = TRUE))
cat("Forward:", length(fnFs), "  Reverse:", length(fnRs), "\n")
## Forward: 4   Reverse: 4
sample.names <- sub("_R1\\.fastq\\.gz$", "", basename(fnFs))
head(sample.names, 8)
## [1] "Lactobacillus_H48_R1" "Lactobacillus_H48_R2" "Spontaneous_H48_R1"  
## [4] "Spontaneous_H48_R2"
options(bitmapType = "cairo")
plotQualityProfile(fnFs[1:4])

plotQualityProfile(fnRs[1:4])

Filtering and trimming

First we will create the list of names for the output files of the filtering step

filtFs <- file.path(path_filtered, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path_filtered, paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names

and then we can perform the actual filterAndTrim:

out <- filterAndTrim(
  fnFs, filtFs,
  fnRs, filtRs,
  truncLen    = c(260, 250),
  maxN        = 0,
  maxEE       = c(2, 2),
  truncQ      = 10,
  rm.phix     = TRUE,
  compress    = TRUE,
  multithread = TRUE
)
saveRDS(out, file.path(path_precomp, "out.rds"))

Now we can create a dataframe to collect statistics on our pipeline progress. First we will store the results of the filtering step:

filter_stats <- as.data.frame(out)
filter_stats$sample   <- rownames(filter_stats)
filter_stats$pct_kept <- round(100 * filter_stats$reads.out / filter_stats$reads.in, 1)
filter_stats[, c("sample", "reads.in", "reads.out", "pct_kept")]
##                                                            sample reads.in
## Lactobacillus_H48_R1_R1.fastq.gz Lactobacillus_H48_R1_R1.fastq.gz   232087
## Lactobacillus_H48_R2_R1.fastq.gz Lactobacillus_H48_R2_R1.fastq.gz   277388
## Spontaneous_H48_R1_R1.fastq.gz     Spontaneous_H48_R1_R1.fastq.gz   311391
## Spontaneous_H48_R2_R1.fastq.gz     Spontaneous_H48_R2_R1.fastq.gz   302367
##                                  reads.out pct_kept
## Lactobacillus_H48_R1_R1.fastq.gz    189922     81.8
## Lactobacillus_H48_R2_R1.fastq.gz    227640     82.1
## Spontaneous_H48_R1_R1.fastq.gz      253245     81.3
## Spontaneous_H48_R2_R1.fastq.gz      243119     80.4
cat("Median retained:", median(filter_stats$pct_kept), "%\n")
## Median retained: 81.55 %
cat("Minimum retained:", min(filter_stats$pct_kept), "%\n")
## Minimum retained: 80.4 %

Did we lose any sample? filterAndTrim() skips writing a file when a sample loses all its reads.

exists_F <- file.exists(filtFs)
exists_R <- file.exists(filtRs)

if (any(!exists_F | !exists_R)) {
  dropped <- sample.names[!exists_F | !exists_R]
  message("Excluded (no reads after filtering): ", paste(dropped, collapse = ", "))
  filtFs <- filtFs[exists_F & exists_R]
  filtRs <- filtRs[exists_F & exists_R]
  sample.names <- names(filtFs)
}
cat("Samples remaining:", length(sample.names), "\n")
## Samples remaining: 4

Error rate modelling

NextSeq 2000 produces binned quality scores; we use a modified loess function (Option 4) suited for sparse, discrete quality levels.

loessErrfun_mod4 <- function(trans) {
  qq  <- as.numeric(colnames(trans))
  est <- matrix(0, nrow = 0, ncol = length(qq))

  for (nti in c("A", "C", "G", "T")) {
    for (ntj in c("A", "C", "G", "T")) {
      if (nti != ntj) {
        errs  <- trans[paste0(nti, "2", ntj), ]
        tot   <- colSums(trans[paste0(nti, "2", c("A", "C", "G", "T")), ])
        rlogp <- log10((errs + 1) / tot)
        rlogp[is.infinite(rlogp)] <- NA

        df <- data.frame(q = qq, errs = errs, tot = tot, rlogp = rlogp)

        mod.lo <- loess(rlogp ~ q, df,
                        weights = log10(tot),
                        degree  = 1,
                        span    = 0.95)

        pred <- predict(mod.lo, qq)
        maxrli <- max(which(!is.na(pred)))
        minrli <- min(which(!is.na(pred)))
        pred[seq_along(pred) > maxrli] <- pred[[maxrli]]
        pred[seq_along(pred) < minrli] <- pred[[minrli]]
        est <- rbind(est, 10^pred)
      }
    }
  }

  MAX_ERROR_RATE <- 0.25
  MIN_ERROR_RATE <- 1e-7
  est[est > MAX_ERROR_RATE] <- MAX_ERROR_RATE
  est[est < MIN_ERROR_RATE] <- MIN_ERROR_RATE

  estorig <- est
  est <- est %>%
    data.frame() %>%
    mutate(across(everything(), ~ if_else(. < X40, X40, .))) %>%
    as.matrix()
  rownames(est) <- rownames(estorig)
  colnames(est) <- colnames(estorig)

  err <- rbind(
    1 - colSums(est[1:3, ]),  est[1:3, ],
    est[4, ],  1 - colSums(est[4:6, ]),  est[5:6, ],
    est[7:8, ], 1 - colSums(est[7:9, ]), est[9, ],
    est[10:12, ], 1 - colSums(est[10:12, ])
  )
  rownames(err) <- paste0(rep(c("A", "C", "G", "T"), each = 4), "2",
                           c("A", "C", "G", "T"))
  colnames(err) <- colnames(trans)
  return(err)
}
errF_file <- file.path(path_precomp, "errF.rds")
errR_file <- file.path(path_precomp, "errR.rds")

if (!file.exists(errF_file)) {
  errF <- learnErrors(filtFs, multithread = TRUE, nbases = 1e10,
                      errorEstimationFunction = loessErrfun_mod4, verbose = TRUE)
  saveRDS(errF, errF_file)
} else {
  errF <- readRDS(errF_file)
}

if (!file.exists(errR_file)) {
  errR <- learnErrors(filtRs, multithread = TRUE, nbases = 1e10,
                      errorEstimationFunction = loessErrfun_mod4, verbose = TRUE)
  saveRDS(errR, errR_file)
} else {
  errR <- readRDS(errR_file)
}
plotErrors(errF, nominalQ = TRUE)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).

plotErrors(errR, nominalQ = TRUE)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).

Denoising, merging and chimera removal

dadaFs_file <- file.path(path_precomp, "dadaFs.rds")
dadaRs_file <- file.path(path_precomp, "dadaRs.rds")

if (!file.exists(dadaFs_file)) {
  dadaFs <- dada(filtFs, err = errF, multithread = TRUE)
  saveRDS(dadaFs, dadaFs_file)
} else {
  dadaFs <- readRDS(dadaFs_file)
}

if (!file.exists(dadaRs_file)) {
  dadaRs <- dada(filtRs, err = errR, multithread = TRUE)
  saveRDS(dadaRs, dadaRs_file)
} else {
  dadaRs <- readRDS(dadaRs_file)
}

dadaFs[[1]]
## dada-class: object describing DADA2 denoising results
## 237 sequence variants were inferred from 24078 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
mergers_file <- file.path(path_precomp, "mergers.rds")

if (!file.exists(mergers_file)) {
  mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose = TRUE)
  saveRDS(mergers, mergers_file)
} else {
  mergers <- readRDS(mergers_file)
}
seqtab <- makeSequenceTable(mergers)
cat("Dimensions (samples × ASVs):", dim(seqtab), "\n")
## Dimensions (samples × ASVs): 4 4516
seq_lengths <- table(nchar(getSequences(seqtab)))
barplot(seq_lengths, xlab = "Length (bp)", ylab = "ASVs",
        main = "ASV length distribution (before chimera removal)", las = 2)

seqtab_file <- file.path(path_precomp, "seqtab_nochim.rds")

if (!file.exists(seqtab_file)) {
  seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                       multithread = TRUE, verbose = TRUE)
  saveRDS(seqtab.nochim, seqtab_file)
} else {
  seqtab.nochim <- readRDS(seqtab_file)
}

cat("ASVs before:", ncol(seqtab), " After:", ncol(seqtab.nochim), "\n")
## ASVs before: 4516  After: 211
cat("Reads retained:", round(100 * sum(seqtab.nochim) / sum(seqtab), 1), "%\n")
## Reads retained: 87.4 %
seq_lengths_nc <- table(nchar(getSequences(seqtab.nochim)))
barplot(seq_lengths_nc, xlab = "Length (bp)", ylab = "ASVs",
        main = "ASV length distribution (after chimera removal)", las = 2)

Read tracking

getN <- function(x) sum(getUniques(x))

# Rename out's rows from filenames to sample names
rownames(out) <- sub("_R1\\.fastq\\.gz$", "", rownames(out))
cat("rownames(out):\n"); print(rownames(out))
## rownames(out):
## [1] "Lactobacillus_H48_R1" "Lactobacillus_H48_R2" "Spontaneous_H48_R1"  
## [4] "Spontaneous_H48_R2"
cat("\nsample.names:\n"); print(sample.names)
## 
## sample.names:
## [1] "Lactobacillus_H48_R1" "Lactobacillus_H48_R2" "Spontaneous_H48_R1"  
## [4] "Spontaneous_H48_R2"
cat("\nMatch check:\n"); print(sample.names %in% rownames(out))
## 
## Match check:
## [1] TRUE TRUE TRUE TRUE
# Now subset to surviving samples
out_df <- as.data.frame(out)

track <- data.frame(
  input     = out_df[sample.names, 1],
  filtered  = out_df[sample.names, 2],
  denoisedF = sapply(dadaFs[sample.names], getN),
  denoisedR = sapply(dadaRs[sample.names], getN),
  merged    = sapply(mergers[sample.names], getN),
  nonchim   = rowSums(seqtab.nochim[sample.names, , drop = FALSE])
)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names

write.csv(track, "dada2_QC.csv")
track
##                       input filtered denoisedF denoisedR merged nonchim
## Lactobacillus_H48_R1 232087   189922    189511    189681 187228  176223
## Lactobacillus_H48_R2 277388   227640    226897    227483 226216  221605
## Spontaneous_H48_R1   311391   253245    252485    252945 227885  196358
## Spontaneous_H48_R2   302367   243119    241862    242432 232799  170092
step_labels <- c(input = "Raw Input", filtered = "Quality Filtered",
                 denoisedF = "Denoised (Fwd)", denoisedR = "Denoised (Rev)",
                 merged = "Merged Pairs", nonchim = "Non-chimeric")

as.data.frame(track) %>%
  rownames_to_column("Sample") %>%
  pivot_longer(-Sample, names_to = "Step", values_to = "Reads") %>%
  mutate(Step = factor(Step, levels = names(step_labels))) %>%
  ggplot(aes(x = Step, y = Reads, fill = Step)) +
  geom_boxplot(outlier.size = 1) +
  scale_x_discrete(labels = step_labels) +
  scale_fill_viridis_d() +
  labs(x = "Pipeline step", y = "Reads", title = "Read counts through the DADA2 pipeline") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

Taxonomy assignment

Uses the SILVA v138.2 reference database (Kingdom → Genus, naive Bayesian classifier).

taxa_file <- file.path(path_precomp, "taxa.rds")

if (!file.exists(taxa_file)) {
  taxa <- assignTaxonomy(seqtab.nochim, path_silva, multithread = TRUE, verbose = TRUE)
  saveRDS(taxa, taxa_file)
} else {
  taxa <- readRDS(taxa_file)
}

taxa.print <- taxa
rownames(taxa.print) <- NULL
head(taxa.print, 10)
##       Kingdom    Phylum           Class                 Order             
##  [1,] "Bacteria" "Bacillota"      "Bacilli"             "Lactobacillales" 
##  [2,] "Bacteria" "Bacillota"      "Bacilli"             "Lactobacillales" 
##  [3,] "Bacteria" "Bacillota"      "Bacilli"             "Lactobacillales" 
##  [4,] "Bacteria" "Bacillota"      "Bacilli"             "Lactobacillales" 
##  [5,] "Bacteria" "Bacillota"      "Bacilli"             "Lactobacillales" 
##  [6,] "Bacteria" "Pseudomonadota" "Gammaproteobacteria" "Enterobacterales"
##  [7,] "Bacteria" "Pseudomonadota" "Gammaproteobacteria" "Enterobacterales"
##  [8,] "Bacteria" "Bacillota"      "Bacilli"             "Lactobacillales" 
##  [9,] "Bacteria" "Bacillota"      "Bacilli"             "Staphylococcales"
## [10,] "Bacteria" "Bacillota"      "Bacilli"             "Lactobacillales" 
##       Family               Genus                
##  [1,] "Lactobacillaceae"   "Lactiplantibacillus"
##  [2,] "Lactobacillaceae"   "Lactiplantibacillus"
##  [3,] "Lactobacillaceae"   "Leuconostoc"        
##  [4,] "Streptococcaceae"   "Lactococcus"        
##  [5,] "Enterococcaceae"    "Enterococcus"       
##  [6,] "Enterobacteriaceae" "Enterobacter"       
##  [7,] "Enterobacteriaceae" "Enterobacter"       
##  [8,] "Lactobacillaceae"   "Weissella"          
##  [9,] "Staphylococcaceae"  "Staphylococcus"     
## [10,] "Enterococcaceae"    "Enterococcus"
taxa_df <- as.data.frame(taxa)
levels  <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")

classif <- data.frame(
  Level = factor(levels, levels = levels),
  ASVs_classified = sapply(levels, function(lv) sum(!is.na(taxa_df[[lv]]))),
  Total = nrow(taxa_df)
) %>%
  mutate(Percentage = round(100 * ASVs_classified / Total, 1))

ggplot(classif, aes(x = Level, y = Percentage, fill = Level)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = paste0(Percentage, "%")), vjust = -0.4, size = 3.5) +
  scale_fill_viridis_d() +
  labs(title = "ASVs classified at each taxonomic level",
       x = "Taxonomic level", y = "% of ASVs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") +
  ylim(0, 110)

Phyloseq object and export

Sample names encode the experimental design: Group_Timepoint_Replicate.

metadata <- data.frame(
  SampleID  = sample.names,
  row.names = sample.names,
  stringsAsFactors = FALSE
) %>%
  mutate(
    Group = case_when(
      startsWith(SampleID, "Lactobacillus") ~ "Lactobacillus",
      startsWith(SampleID, "Spontaneous")   ~ "Spontaneous",
      startsWith(SampleID, "Commercial")    ~ "Commercial",
      startsWith(SampleID, "Control")       ~ "Control",
      TRUE ~ "Unknown"
    ),
    Timepoint = case_when(
      grepl("_H24_", SampleID) ~ "H24",
      grepl("_H48_", SampleID) ~ "H48",
      grepl("_W1_",  SampleID) ~ "W1",
      grepl("_W2_",  SampleID) ~ "W2",
      TRUE ~ NA_character_
    ),
    Replicate = as.integer(sub(".*_R(\\d+)$", "\\1", SampleID)),
    IsControl = Group == "Control"
  )

stopifnot(all(rownames(seqtab.nochim) %in% rownames(metadata)))
metadata
##                                  SampleID         Group Timepoint Replicate
## Lactobacillus_H48_R1 Lactobacillus_H48_R1 Lactobacillus       H48         1
## Lactobacillus_H48_R2 Lactobacillus_H48_R2 Lactobacillus       H48         2
## Spontaneous_H48_R1     Spontaneous_H48_R1   Spontaneous       H48         1
## Spontaneous_H48_R2     Spontaneous_H48_R2   Spontaneous       H48         2
##                      IsControl
## Lactobacillus_H48_R1     FALSE
## Lactobacillus_H48_R2     FALSE
## Spontaneous_H48_R1       FALSE
## Spontaneous_H48_R2       FALSE
metadata_matched <- metadata[rownames(seqtab.nochim), ]
seqtab_matched   <- seqtab.nochim[rownames(metadata_matched), ]
stopifnot(all(rownames(metadata_matched) == rownames(seqtab_matched)))

otu <- otu_table(t(seqtab_matched), taxa_are_rows = TRUE)
tax <- tax_table(taxa)
sd  <- sample_data(metadata_matched)

ps <- phyloseq(otu, sd, tax)

dna        <- DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps         <- merge_phyloseq(ps, dna)

taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 211 taxa and 4 samples ]
## sample_data() Sample Data:       [ 4 samples by 5 sample variables ]
## tax_table()   Taxonomy Table:    [ 211 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 211 reference sequences ]

Remove mitochondrial and chloroplast sequences, then ASVs unclassified at Family level.

ps_filtered <- ps %>%
  subset_taxa(
    (is.na(Family) | !grepl("mitochondria", Family, ignore.case = TRUE)) &
    (is.na(Order)  | !grepl("mitochondria", Order,  ignore.case = TRUE)) &
    (is.na(Genus)  | !grepl("mitochondria", Genus,  ignore.case = TRUE)) &
    (is.na(Family) | !grepl("chloroplast",  Family, ignore.case = TRUE)) &
    (is.na(Order)  | !grepl("chloroplast",  Order,  ignore.case = TRUE)) &
    (is.na(Order)  | Order != "Chloroplastida")
  )

cat("Removed (mitochondria/chloroplasts):", ntaxa(ps) - ntaxa(ps_filtered), "\n")
## Removed (mitochondria/chloroplasts): 8
ps_final <- ps_filtered %>%
  subset_taxa(!is.na(Family) & Family != "")

cat("Removed (unclassified at Family):", ntaxa(ps_filtered) - ntaxa(ps_final), "\n")
## Removed (unclassified at Family): 0
cat("Final ASVs:", ntaxa(ps_final), "\n")
## Final ASVs: 203
cat("Total reads:", sum(sample_sums(ps_final)), "\n")
## Total reads: 758562
cat("\n=== FILTERING SUMMARY ===\n")
## 
## === FILTERING SUMMARY ===
cat("After chimera removal:            ", ntaxa(ps), "ASVs\n")
## After chimera removal:             211 ASVs
cat("After removing mito/chloroplasts: ", ntaxa(ps_filtered), "ASVs\n")
## After removing mito/chloroplasts:  203 ASVs
cat("After removing unclassified fam:  ", ntaxa(ps_final), "ASVs\n")
## After removing unclassified fam:   203 ASVs
cat("Samples:                          ", nsamples(ps_final), "\n")
## Samples:                           4
cat("Total reads:                      ", sum(sample_sums(ps_final)), "\n")
## Total reads:                       758562
saveRDS(ps_final, "phyloseq_object.rds")

asv_sequences <- data.frame(
  ASV_ID   = taxa_names(ps_final),
  Sequence = as.character(refseq(ps_final)),
  stringsAsFactors = FALSE
)
write.csv(asv_sequences, "asv_sequences_map.csv", row.names = FALSE)

writeXStringSet(refseq(ps_final), "asv_sequences_map.fasta")

asv_counts   <- as.data.frame(otu_table(ps_final))
tax_table_df <- as.data.frame(tax_table(ps_final))
write.csv(cbind(tax_table_df, asv_counts), "asv_tax_table.csv", row.names = TRUE)

sample_abundances <- as.data.frame(t(otu_table(ps_final)))
sample_metadata   <- as.data.frame(sample_data(ps_final))
write.csv(cbind(sample_metadata, sample_abundances), "samples_with_abundances.csv", row.names = TRUE)

cat("Saved: phyloseq_object.rds, asv_sequences_map.csv, asv_sequences_map.fasta,\n")
## Saved: phyloseq_object.rds, asv_sequences_map.csv, asv_sequences_map.fasta,
cat("       asv_tax_table.csv, samples_with_abundances.csv\n")
##        asv_tax_table.csv, samples_with_abundances.csv