library(dplyr)
library(tidyr)
library(ggplot2)
library(tibble)
library(dada2)
library(phyloseq)
library(Biostrings)
# Input files
path_reads <- "/qib/platforms/Informatics/transfer/outgoing/training/metabarcoding-cucumber-reads-small/"
path_silva <- "/qib/platforms/Informatics/transfer/outgoing/training/metabarcoding-db/silva_nr99_v138.2_toGenus_trainset.fa.gz"
# Output files
path_filtered <- paste0(base, "filtered")
path_precomp <- paste0(base, "precomputed")
dir.create(path_filtered, showWarnings = FALSE, recursive = TRUE)
dir.create(path_precomp, showWarnings = FALSE, recursive = TRUE)
cat(path_reads)
## /qib/platforms/Informatics/transfer/outgoing/training/metabarcoding-cucumber-reads-small/
Reads are primer-trimmed (cutadapt, primers: F
CCTACGGGNGGCWGCAG, R
GGACTACHVGGGTATCTAATCC).
fnFs <- sort(list.files(path_reads, pattern = "_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path_reads, pattern = "_R2.fastq.gz", full.names = TRUE))
cat("Forward:", length(fnFs), " Reverse:", length(fnRs), "\n")
## Forward: 4 Reverse: 4
sample.names <- sub("_R1\\.fastq\\.gz$", "", basename(fnFs))
head(sample.names, 8)
## [1] "Lactobacillus_H48_R1" "Lactobacillus_H48_R2" "Spontaneous_H48_R1"
## [4] "Spontaneous_H48_R2"
options(bitmapType = "cairo")
plotQualityProfile(fnFs[1:4])
plotQualityProfile(fnRs[1:4])
First we will create the list of names for the output files of the filtering step
filtFs <- file.path(path_filtered, paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path_filtered, paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
and then we can perform the actual filterAndTrim:
out <- filterAndTrim(
fnFs, filtFs,
fnRs, filtRs,
truncLen = c(260, 250),
maxN = 0,
maxEE = c(2, 2),
truncQ = 10,
rm.phix = TRUE,
compress = TRUE,
multithread = TRUE
)
saveRDS(out, file.path(path_precomp, "out.rds"))
Now we can create a dataframe to collect statistics on our pipeline progress. First we will store the results of the filtering step:
filter_stats <- as.data.frame(out)
filter_stats$sample <- rownames(filter_stats)
filter_stats$pct_kept <- round(100 * filter_stats$reads.out / filter_stats$reads.in, 1)
filter_stats[, c("sample", "reads.in", "reads.out", "pct_kept")]
## sample reads.in
## Lactobacillus_H48_R1_R1.fastq.gz Lactobacillus_H48_R1_R1.fastq.gz 232087
## Lactobacillus_H48_R2_R1.fastq.gz Lactobacillus_H48_R2_R1.fastq.gz 277388
## Spontaneous_H48_R1_R1.fastq.gz Spontaneous_H48_R1_R1.fastq.gz 311391
## Spontaneous_H48_R2_R1.fastq.gz Spontaneous_H48_R2_R1.fastq.gz 302367
## reads.out pct_kept
## Lactobacillus_H48_R1_R1.fastq.gz 189922 81.8
## Lactobacillus_H48_R2_R1.fastq.gz 227640 82.1
## Spontaneous_H48_R1_R1.fastq.gz 253245 81.3
## Spontaneous_H48_R2_R1.fastq.gz 243119 80.4
cat("Median retained:", median(filter_stats$pct_kept), "%\n")
## Median retained: 81.55 %
cat("Minimum retained:", min(filter_stats$pct_kept), "%\n")
## Minimum retained: 80.4 %
Did we lose any sample? filterAndTrim() skips writing a file when a sample loses all its reads.
exists_F <- file.exists(filtFs)
exists_R <- file.exists(filtRs)
if (any(!exists_F | !exists_R)) {
dropped <- sample.names[!exists_F | !exists_R]
message("Excluded (no reads after filtering): ", paste(dropped, collapse = ", "))
filtFs <- filtFs[exists_F & exists_R]
filtRs <- filtRs[exists_F & exists_R]
sample.names <- names(filtFs)
}
cat("Samples remaining:", length(sample.names), "\n")
## Samples remaining: 4
NextSeq 2000 produces binned quality scores; we use a modified loess function (Option 4) suited for sparse, discrete quality levels.
loessErrfun_mod4 <- function(trans) {
qq <- as.numeric(colnames(trans))
est <- matrix(0, nrow = 0, ncol = length(qq))
for (nti in c("A", "C", "G", "T")) {
for (ntj in c("A", "C", "G", "T")) {
if (nti != ntj) {
errs <- trans[paste0(nti, "2", ntj), ]
tot <- colSums(trans[paste0(nti, "2", c("A", "C", "G", "T")), ])
rlogp <- log10((errs + 1) / tot)
rlogp[is.infinite(rlogp)] <- NA
df <- data.frame(q = qq, errs = errs, tot = tot, rlogp = rlogp)
mod.lo <- loess(rlogp ~ q, df,
weights = log10(tot),
degree = 1,
span = 0.95)
pred <- predict(mod.lo, qq)
maxrli <- max(which(!is.na(pred)))
minrli <- min(which(!is.na(pred)))
pred[seq_along(pred) > maxrli] <- pred[[maxrli]]
pred[seq_along(pred) < minrli] <- pred[[minrli]]
est <- rbind(est, 10^pred)
}
}
}
MAX_ERROR_RATE <- 0.25
MIN_ERROR_RATE <- 1e-7
est[est > MAX_ERROR_RATE] <- MAX_ERROR_RATE
est[est < MIN_ERROR_RATE] <- MIN_ERROR_RATE
estorig <- est
est <- est %>%
data.frame() %>%
mutate(across(everything(), ~ if_else(. < X40, X40, .))) %>%
as.matrix()
rownames(est) <- rownames(estorig)
colnames(est) <- colnames(estorig)
err <- rbind(
1 - colSums(est[1:3, ]), est[1:3, ],
est[4, ], 1 - colSums(est[4:6, ]), est[5:6, ],
est[7:8, ], 1 - colSums(est[7:9, ]), est[9, ],
est[10:12, ], 1 - colSums(est[10:12, ])
)
rownames(err) <- paste0(rep(c("A", "C", "G", "T"), each = 4), "2",
c("A", "C", "G", "T"))
colnames(err) <- colnames(trans)
return(err)
}
errF_file <- file.path(path_precomp, "errF.rds")
errR_file <- file.path(path_precomp, "errR.rds")
if (!file.exists(errF_file)) {
errF <- learnErrors(filtFs, multithread = TRUE, nbases = 1e10,
errorEstimationFunction = loessErrfun_mod4, verbose = TRUE)
saveRDS(errF, errF_file)
} else {
errF <- readRDS(errF_file)
}
if (!file.exists(errR_file)) {
errR <- learnErrors(filtRs, multithread = TRUE, nbases = 1e10,
errorEstimationFunction = loessErrfun_mod4, verbose = TRUE)
saveRDS(errR, errR_file)
} else {
errR <- readRDS(errR_file)
}
plotErrors(errF, nominalQ = TRUE)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).
plotErrors(errR, nominalQ = TRUE)
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 164 rows containing missing values or values outside the scale range
## (`geom_line()`).
dadaFs_file <- file.path(path_precomp, "dadaFs.rds")
dadaRs_file <- file.path(path_precomp, "dadaRs.rds")
if (!file.exists(dadaFs_file)) {
dadaFs <- dada(filtFs, err = errF, multithread = TRUE)
saveRDS(dadaFs, dadaFs_file)
} else {
dadaFs <- readRDS(dadaFs_file)
}
if (!file.exists(dadaRs_file)) {
dadaRs <- dada(filtRs, err = errR, multithread = TRUE)
saveRDS(dadaRs, dadaRs_file)
} else {
dadaRs <- readRDS(dadaRs_file)
}
dadaFs[[1]]
## dada-class: object describing DADA2 denoising results
## 237 sequence variants were inferred from 24078 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
mergers_file <- file.path(path_precomp, "mergers.rds")
if (!file.exists(mergers_file)) {
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose = TRUE)
saveRDS(mergers, mergers_file)
} else {
mergers <- readRDS(mergers_file)
}
seqtab <- makeSequenceTable(mergers)
cat("Dimensions (samples × ASVs):", dim(seqtab), "\n")
## Dimensions (samples × ASVs): 4 4516
seq_lengths <- table(nchar(getSequences(seqtab)))
barplot(seq_lengths, xlab = "Length (bp)", ylab = "ASVs",
main = "ASV length distribution (before chimera removal)", las = 2)
seqtab_file <- file.path(path_precomp, "seqtab_nochim.rds")
if (!file.exists(seqtab_file)) {
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
multithread = TRUE, verbose = TRUE)
saveRDS(seqtab.nochim, seqtab_file)
} else {
seqtab.nochim <- readRDS(seqtab_file)
}
cat("ASVs before:", ncol(seqtab), " After:", ncol(seqtab.nochim), "\n")
## ASVs before: 4516 After: 211
cat("Reads retained:", round(100 * sum(seqtab.nochim) / sum(seqtab), 1), "%\n")
## Reads retained: 87.4 %
seq_lengths_nc <- table(nchar(getSequences(seqtab.nochim)))
barplot(seq_lengths_nc, xlab = "Length (bp)", ylab = "ASVs",
main = "ASV length distribution (after chimera removal)", las = 2)
getN <- function(x) sum(getUniques(x))
# Rename out's rows from filenames to sample names
rownames(out) <- sub("_R1\\.fastq\\.gz$", "", rownames(out))
cat("rownames(out):\n"); print(rownames(out))
## rownames(out):
## [1] "Lactobacillus_H48_R1" "Lactobacillus_H48_R2" "Spontaneous_H48_R1"
## [4] "Spontaneous_H48_R2"
cat("\nsample.names:\n"); print(sample.names)
##
## sample.names:
## [1] "Lactobacillus_H48_R1" "Lactobacillus_H48_R2" "Spontaneous_H48_R1"
## [4] "Spontaneous_H48_R2"
cat("\nMatch check:\n"); print(sample.names %in% rownames(out))
##
## Match check:
## [1] TRUE TRUE TRUE TRUE
# Now subset to surviving samples
out_df <- as.data.frame(out)
track <- data.frame(
input = out_df[sample.names, 1],
filtered = out_df[sample.names, 2],
denoisedF = sapply(dadaFs[sample.names], getN),
denoisedR = sapply(dadaRs[sample.names], getN),
merged = sapply(mergers[sample.names], getN),
nonchim = rowSums(seqtab.nochim[sample.names, , drop = FALSE])
)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
write.csv(track, "dada2_QC.csv")
track
## input filtered denoisedF denoisedR merged nonchim
## Lactobacillus_H48_R1 232087 189922 189511 189681 187228 176223
## Lactobacillus_H48_R2 277388 227640 226897 227483 226216 221605
## Spontaneous_H48_R1 311391 253245 252485 252945 227885 196358
## Spontaneous_H48_R2 302367 243119 241862 242432 232799 170092
step_labels <- c(input = "Raw Input", filtered = "Quality Filtered",
denoisedF = "Denoised (Fwd)", denoisedR = "Denoised (Rev)",
merged = "Merged Pairs", nonchim = "Non-chimeric")
as.data.frame(track) %>%
rownames_to_column("Sample") %>%
pivot_longer(-Sample, names_to = "Step", values_to = "Reads") %>%
mutate(Step = factor(Step, levels = names(step_labels))) %>%
ggplot(aes(x = Step, y = Reads, fill = Step)) +
geom_boxplot(outlier.size = 1) +
scale_x_discrete(labels = step_labels) +
scale_fill_viridis_d() +
labs(x = "Pipeline step", y = "Reads", title = "Read counts through the DADA2 pipeline") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
Uses the SILVA v138.2 reference database (Kingdom → Genus, naive Bayesian classifier).
taxa_file <- file.path(path_precomp, "taxa.rds")
if (!file.exists(taxa_file)) {
taxa <- assignTaxonomy(seqtab.nochim, path_silva, multithread = TRUE, verbose = TRUE)
saveRDS(taxa, taxa_file)
} else {
taxa <- readRDS(taxa_file)
}
taxa.print <- taxa
rownames(taxa.print) <- NULL
head(taxa.print, 10)
## Kingdom Phylum Class Order
## [1,] "Bacteria" "Bacillota" "Bacilli" "Lactobacillales"
## [2,] "Bacteria" "Bacillota" "Bacilli" "Lactobacillales"
## [3,] "Bacteria" "Bacillota" "Bacilli" "Lactobacillales"
## [4,] "Bacteria" "Bacillota" "Bacilli" "Lactobacillales"
## [5,] "Bacteria" "Bacillota" "Bacilli" "Lactobacillales"
## [6,] "Bacteria" "Pseudomonadota" "Gammaproteobacteria" "Enterobacterales"
## [7,] "Bacteria" "Pseudomonadota" "Gammaproteobacteria" "Enterobacterales"
## [8,] "Bacteria" "Bacillota" "Bacilli" "Lactobacillales"
## [9,] "Bacteria" "Bacillota" "Bacilli" "Staphylococcales"
## [10,] "Bacteria" "Bacillota" "Bacilli" "Lactobacillales"
## Family Genus
## [1,] "Lactobacillaceae" "Lactiplantibacillus"
## [2,] "Lactobacillaceae" "Lactiplantibacillus"
## [3,] "Lactobacillaceae" "Leuconostoc"
## [4,] "Streptococcaceae" "Lactococcus"
## [5,] "Enterococcaceae" "Enterococcus"
## [6,] "Enterobacteriaceae" "Enterobacter"
## [7,] "Enterobacteriaceae" "Enterobacter"
## [8,] "Lactobacillaceae" "Weissella"
## [9,] "Staphylococcaceae" "Staphylococcus"
## [10,] "Enterococcaceae" "Enterococcus"
taxa_df <- as.data.frame(taxa)
levels <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus")
classif <- data.frame(
Level = factor(levels, levels = levels),
ASVs_classified = sapply(levels, function(lv) sum(!is.na(taxa_df[[lv]]))),
Total = nrow(taxa_df)
) %>%
mutate(Percentage = round(100 * ASVs_classified / Total, 1))
ggplot(classif, aes(x = Level, y = Percentage, fill = Level)) +
geom_col(width = 0.7) +
geom_text(aes(label = paste0(Percentage, "%")), vjust = -0.4, size = 3.5) +
scale_fill_viridis_d() +
labs(title = "ASVs classified at each taxonomic level",
x = "Taxonomic level", y = "% of ASVs") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") +
ylim(0, 110)
Sample names encode the experimental design:
Group_Timepoint_Replicate.
metadata <- data.frame(
SampleID = sample.names,
row.names = sample.names,
stringsAsFactors = FALSE
) %>%
mutate(
Group = case_when(
startsWith(SampleID, "Lactobacillus") ~ "Lactobacillus",
startsWith(SampleID, "Spontaneous") ~ "Spontaneous",
startsWith(SampleID, "Commercial") ~ "Commercial",
startsWith(SampleID, "Control") ~ "Control",
TRUE ~ "Unknown"
),
Timepoint = case_when(
grepl("_H24_", SampleID) ~ "H24",
grepl("_H48_", SampleID) ~ "H48",
grepl("_W1_", SampleID) ~ "W1",
grepl("_W2_", SampleID) ~ "W2",
TRUE ~ NA_character_
),
Replicate = as.integer(sub(".*_R(\\d+)$", "\\1", SampleID)),
IsControl = Group == "Control"
)
stopifnot(all(rownames(seqtab.nochim) %in% rownames(metadata)))
metadata
## SampleID Group Timepoint Replicate
## Lactobacillus_H48_R1 Lactobacillus_H48_R1 Lactobacillus H48 1
## Lactobacillus_H48_R2 Lactobacillus_H48_R2 Lactobacillus H48 2
## Spontaneous_H48_R1 Spontaneous_H48_R1 Spontaneous H48 1
## Spontaneous_H48_R2 Spontaneous_H48_R2 Spontaneous H48 2
## IsControl
## Lactobacillus_H48_R1 FALSE
## Lactobacillus_H48_R2 FALSE
## Spontaneous_H48_R1 FALSE
## Spontaneous_H48_R2 FALSE
metadata_matched <- metadata[rownames(seqtab.nochim), ]
seqtab_matched <- seqtab.nochim[rownames(metadata_matched), ]
stopifnot(all(rownames(metadata_matched) == rownames(seqtab_matched)))
otu <- otu_table(t(seqtab_matched), taxa_are_rows = TRUE)
tax <- tax_table(taxa)
sd <- sample_data(metadata_matched)
ps <- phyloseq(otu, sd, tax)
dna <- DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 211 taxa and 4 samples ]
## sample_data() Sample Data: [ 4 samples by 5 sample variables ]
## tax_table() Taxonomy Table: [ 211 taxa by 6 taxonomic ranks ]
## refseq() DNAStringSet: [ 211 reference sequences ]
Remove mitochondrial and chloroplast sequences, then ASVs unclassified at Family level.
ps_filtered <- ps %>%
subset_taxa(
(is.na(Family) | !grepl("mitochondria", Family, ignore.case = TRUE)) &
(is.na(Order) | !grepl("mitochondria", Order, ignore.case = TRUE)) &
(is.na(Genus) | !grepl("mitochondria", Genus, ignore.case = TRUE)) &
(is.na(Family) | !grepl("chloroplast", Family, ignore.case = TRUE)) &
(is.na(Order) | !grepl("chloroplast", Order, ignore.case = TRUE)) &
(is.na(Order) | Order != "Chloroplastida")
)
cat("Removed (mitochondria/chloroplasts):", ntaxa(ps) - ntaxa(ps_filtered), "\n")
## Removed (mitochondria/chloroplasts): 8
ps_final <- ps_filtered %>%
subset_taxa(!is.na(Family) & Family != "")
cat("Removed (unclassified at Family):", ntaxa(ps_filtered) - ntaxa(ps_final), "\n")
## Removed (unclassified at Family): 0
cat("Final ASVs:", ntaxa(ps_final), "\n")
## Final ASVs: 203
cat("Total reads:", sum(sample_sums(ps_final)), "\n")
## Total reads: 758562
cat("\n=== FILTERING SUMMARY ===\n")
##
## === FILTERING SUMMARY ===
cat("After chimera removal: ", ntaxa(ps), "ASVs\n")
## After chimera removal: 211 ASVs
cat("After removing mito/chloroplasts: ", ntaxa(ps_filtered), "ASVs\n")
## After removing mito/chloroplasts: 203 ASVs
cat("After removing unclassified fam: ", ntaxa(ps_final), "ASVs\n")
## After removing unclassified fam: 203 ASVs
cat("Samples: ", nsamples(ps_final), "\n")
## Samples: 4
cat("Total reads: ", sum(sample_sums(ps_final)), "\n")
## Total reads: 758562
saveRDS(ps_final, "phyloseq_object.rds")
asv_sequences <- data.frame(
ASV_ID = taxa_names(ps_final),
Sequence = as.character(refseq(ps_final)),
stringsAsFactors = FALSE
)
write.csv(asv_sequences, "asv_sequences_map.csv", row.names = FALSE)
writeXStringSet(refseq(ps_final), "asv_sequences_map.fasta")
asv_counts <- as.data.frame(otu_table(ps_final))
tax_table_df <- as.data.frame(tax_table(ps_final))
write.csv(cbind(tax_table_df, asv_counts), "asv_tax_table.csv", row.names = TRUE)
sample_abundances <- as.data.frame(t(otu_table(ps_final)))
sample_metadata <- as.data.frame(sample_data(ps_final))
write.csv(cbind(sample_metadata, sample_abundances), "samples_with_abundances.csv", row.names = TRUE)
cat("Saved: phyloseq_object.rds, asv_sequences_map.csv, asv_sequences_map.fasta,\n")
## Saved: phyloseq_object.rds, asv_sequences_map.csv, asv_sequences_map.fasta,
cat(" asv_tax_table.csv, samples_with_abundances.csv\n")
## asv_tax_table.csv, samples_with_abundances.csv