Snakemake Pipeline for Raw FASTQ Processing with Terra Compatibility


Here is a Snakemake pipeline that processes raw FASTQ files and outputs several tables (a chimera-free ASV sequence table plus DECIPHER and SILVA taxonomy assignments) for further processing in R.

I think a path to making this run on Terra is to use the official Snakemake Docker container: https://hub.docker.com/r/snakemake/snakemake
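
A hedged sketch of that setup: a Terra/WDL task could use snakemake/snakemake as its runtime image and simply invoke Snakemake inside it. The target file names below are placeholders for the tables this pipeline hands off to R, not paths taken from the repository.

# Illustrative only: with snakemake/snakemake as the task's Docker image,
# the task command could be, e.g.:
#     snakemake --use-conda --cores 8
# The inputs of `rule all` are hypothetical final targets.
rule all:
    input:
        "results/seqtab.nochim.collapsed.rds",
        "results/taxonomy_decipher.rds",
        "results/taxonomy_silva_species.rds",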

Code Snippets

scripts/assign_taxonomy_decipher.R

library(DECIPHER)
library(dada2)
library(Biostrings)

seqtab.nochim <- readRDS(snakemake@input[["seqtab"]])

dna <- DNAStringSet(getSequences(seqtab.nochim)) # Create a DNAStringSet from the ASVs
load(snakemake@input[["decipher_db"]]) # loads the DECIPHER training set object (trainingSet); path comes from the rule input
ids <- IdTaxa(dna, trainingSet, strand = "both", processors = snakemake@threads, verbose = FALSE) # use the threads allocated by Snakemake
ranks <- c("domain", "phylum", "class", "order", "family", "genus") # ranks of interest
# Convert the output object of class "Taxa" to a matrix analogous to the output from assignTaxonomy
taxid <- t(sapply(ids, function(x) {
        m <- match(ranks, x$rank)
        taxa <- x$taxon[m]
        taxa[startsWith(taxa, "unclassified_")] <- NA
        taxa
}))
colnames(taxid) <- ranks
rownames(taxid) <- getSequences(seqtab.nochim)

saveRDS(taxid, snakemake@output[["taxonomy_decipher"]])

scripts/assign_taxonomy_silva_species.R

library(dada2)
library(Biostrings)

seqtab.nochim <- readRDS(snakemake@input[["seqtab"]])

dna <- DNAStringSet(getSequences(seqtab.nochim)) # Create a DNAStringSet from the ASVs

species_assignment <- assignSpecies(
                            dna,
                            snakemake@input[["silva_species_db"]],
                            allowMultiple = TRUE,
                            tryRC = TRUE,
                            n = 1000,
                            verbose = FALSE)


saveRDS(species_assignment, snakemake@output[["taxonomy_silva_species"]])

wrapper:
    "0.79.0/bio/dada2/filter-trim"
Snakemake, from line 41 of main/Snakefile
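
The rule bodies from main/Snakefile are not reproduced on this page, so here is a hedged sketch of a rule that could drive this wrapper, built from the input/output keys its script (shown further down) reads; sample wildcards, file paths, params, and thread counts are illustrative assumptions.

rule filter_trim:
    input:
        fwd="fastq/{sample}_R1.fastq.gz",   # hypothetical raw read paths
        rev="fastq/{sample}_R2.fastq.gz",
    output:
        filt="filtered/{sample}_R1.fastq.gz",      # .gz extension => compress=TRUE in the wrapper
        filt_rev="filtered/{sample}_R2.fastq.gz",
        stats="reports/{sample}.filter_trim.tsv",
    params:
        maxEE=2,      # illustrative dada2::filterAndTrim() options
        truncQ=2,
    threads: 4
    log:
        "logs/filter_trim/{sample}.log",
    wrapper:
        "0.79.0/bio/dada2/filter-trim"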

wrapper:
    "0.79.0/bio/dada2/learn-errors"
Snakemake, from line 62 of main/Snakefile
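
Similarly, a hedged sketch for the error-learning step; the wrapper script passes all inputs to dada2::learnErrors() and writes an RDS (err) plus a diagnostic plot (plot). SAMPLES and the paths are assumptions.

rule learn_errors:
    input:
        expand("filtered/{sample}_R1.fastq.gz", sample=SAMPLES),  # SAMPLES: assumed sample list
    output:
        err="results/errors_R1.rds",
        plot="results/errors_R1.png",
    threads: 8
    log:
        "logs/learn_errors_R1.log",
    wrapper:
        "0.79.0/bio/dada2/learn-errors"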

wrapper:
    "0.79.0/bio/dada2/dereplicate-fastq"
Snakemake, from line 78 of main/Snakefile
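
A hedged sketch for dereplication; the wrapper script passes the FASTQ inputs to dada2::derepFastq() and saves a single RDS output (paths and wildcards are illustrative).

rule dereplicate:
    input:
        "filtered/{sample}_R1.fastq.gz",
    output:
        "uniques/{sample}_R1.rds",
    log:
        "logs/dereplicate/{sample}_R1.log",
    wrapper:
        "0.79.0/bio/dada2/dereplicate-fastq"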

wrapper:
    "0.79.0/bio/dada2/sample-inference"
Snakemake, from line 96 of main/Snakefile
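
A hedged sketch for sample inference; the wrapper script expects a dereplicated-reads RDS under the input key derep and an error-model RDS under err (paths are illustrative).

rule sample_inference:
    input:
        derep="uniques/{sample}_R1.rds",
        err="results/errors_R1.rds",
    output:
        "denoised/{sample}_R1.rds",
    threads: 4
    log:
        "logs/sample_inference/{sample}_R1.log",
    wrapper:
        "0.79.0/bio/dada2/sample-inference"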

wrapper:
    "0.79.0/bio/dada2/make-table"
Snakemake, from line 112 of main/Snakefile
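
A hedged sketch for building the sequence table; the wrapper script reads every input RDS and accepts an optional names parameter to label the samples (SAMPLES and paths are assumptions).

rule make_table:
    input:
        expand("denoised/{sample}_R1.rds", sample=SAMPLES),  # SAMPLES: assumed sample list
    output:
        "results/seqtab.rds",
    params:
        names=SAMPLES,   # row names for the sequence table
    log:
        "logs/make_table.log",
    wrapper:
        "0.79.0/bio/dada2/make-table"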

wrapper:
    "0.79.0/bio/dada2/remove-chimeras"
Snakemake, from line 125 of main/Snakefile
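
A hedged sketch for chimera removal; the wrapper script takes the sequence-table RDS as its only input and writes the chimera-free table as an RDS (paths are illustrative).

rule remove_chimeras:
    input:
        "results/seqtab.rds",
    output:
        "results/seqtab.nochim.rds",
    threads: 8
    log:
        "logs/remove_chimeras.log",
    wrapper:
        "0.79.0/bio/dada2/remove-chimeras"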

wrapper:
    "0.79.0/bio/dada2/collapse-nomismatch"
Snakemake, from line 138 of main/Snakefile
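
A hedged sketch for collapsing sequences that are identical apart from length variation; the wrapper script again takes one RDS in and writes one RDS out (paths are illustrative).

rule collapse_nomismatch:
    input:
        "results/seqtab.nochim.rds",
    output:
        "results/seqtab.nochim.collapsed.rds",
    log:
        "logs/collapse_nomismatch.log",
    wrapper:
        "0.79.0/bio/dada2/collapse-nomismatch"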

script:
    "scripts/assign_taxonomy_decipher.R"
Snakemake, from line 150 of main/Snakefile
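
A hedged sketch of the rule driving this script; the script (listed at the top of the page) reads seqtab and decipher_db inputs, uses the rule's threads, and writes the taxonomy_decipher output. The training-set path is a placeholder.

rule assign_taxonomy_decipher:
    input:
        seqtab="results/seqtab.nochim.collapsed.rds",
        decipher_db="db/decipher_training_set.RData",  # placeholder for the DECIPHER training set
    output:
        taxonomy_decipher="results/taxonomy_decipher.rds",
    threads: 8
    script:
        "scripts/assign_taxonomy_decipher.R"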

script:
    "scripts/assign_taxonomy_silva_species.R"
Snakemake, from line 162 of main/Snakefile
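
And a hedged sketch for the species-level assignment; its script reads seqtab and silva_species_db inputs and writes the taxonomy_silva_species output (the database path is a placeholder).

rule assign_taxonomy_silva_species:
    input:
        seqtab="results/seqtab.nochim.collapsed.rds",
        silva_species_db="db/silva_species_assignment.fa.gz",  # placeholder for the SILVA species FASTA
    output:
        taxonomy_silva_species="results/taxonomy_silva_species.rds",
    script:
        "scripts/assign_taxonomy_silva_species.R"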

Wrapper script: 0.79.0/bio/dada2/collapse-nomismatch

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           seqtab = readRDS(snakemake@input[[1]])
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::collapseNoMismatch()")
}

# Collapse sequences
taxa<-do.call(collapseNoMismatch, args)

# Store the resulting table as a RDS file
saveRDS(taxa, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/dereplicate-fastq

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list( fls = unlist(snakemake@input))
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::derepFastq()")
}
# Dereplicate
uniques<-do.call(derepFastq, args)

# Store as RDS file
saveRDS(uniques,snakemake@output[[1]])

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/filter-trim

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
        fwd = snakemake@input[["fwd"]],
        filt = snakemake@output[["filt"]],
        multithread=snakemake@threads
)
# Test if paired end input is passed
if(!is.null(snakemake@input[["rev"]]) & !is.null(snakemake@output[["filt_rev"]])){
        args<-c(args,
            rev = snakemake@input[["rev"]],
            filt.rev = snakemake@output[["filt_rev"]]
            )
}
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
    # Keeping only the named elements of the list for do.call()
    extra<-snakemake@params[ names(snakemake@params) != "" ]
    # Check if 'compress=' option is passed
    if(!is.null(extra[["compress"]])){
        stop("Remove the `compress=` option from `params`.\n",
            "The `compress` option is implicitly set here from the file extension.")
    } else {
        # Check if output files are given as compressed files
        # ex: in se version, all(TRUE, NULL) gives TRUE
        compressed <- c(
            endsWith(args[["filt"]], '.gz'),
            if(is.null(args[["filt.rev"]])) NULL else {endsWith(args[["filt.rev"]], 'gz')}
        )
        if ( all(compressed) ) {
            extra[["compress"]] <- TRUE
        } else if ( any(compressed) ) {
            stop("Either all or no fastq output should be compressed. Please check `output.filt` and `output.filt_rev` for consistency.")
        } else {
            extra[["compress"]] <- FALSE
        }
    }
    # Add them to the list of arguments
    args<-c(args, extra)
} else {
    message("No optional parameters. Using default parameters from dada2::filterAndTrim()")
}

# Call the function with arguments
filt.stats<-do.call(filterAndTrim, args)

# Write processed reads report
write.table(filt.stats, snakemake@output[["stats"]], sep="\t", quote=F)
# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/learn-errors

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           fls = snakemake@input,
           multithread=snakemake@threads
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using defaults parameters from dada2::learnErrors()")
}

# Learn error rates from the input reads
err<-do.call(learnErrors, args)

# Plot estimated versus observed error rates to validate models
perr<-plotErrors(err, nominalQ = TRUE)

# Save the plots
library(ggplot2)
ggsave(snakemake@output[["plot"]], perr, width = 8, height = 8, dpi = 300)

# Store the estimated errors as RDS files
saveRDS(err, snakemake@output[["err"]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/make-table

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# If names are provided use them
nm<-if(is.null(snakemake@params[["names"]])) NULL else snakemake@params[["names"]]

# From a list of n lists to one named list of n elements 
smps<-setNames(
               object=unlist(snakemake@input),
               nm=nm
               )
# Read the RDS into the list
smps<-lapply(smps, readRDS)

# Prepare arguments (no matter the order)
args<-list( samples = smps)
# Check if extra params are passed (apart from [["names"]])
if(length(snakemake@params) > 1 ){
       # Keeping only the named elements of the list for do.call() (apart from [["names"]])
       extra<-snakemake@params[ names(snakemake@params) != "" & names(snakemake@params) != "names" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::makeSequenceTable()")
}

# Make table
seqTab<-do.call(makeSequenceTable, args)

# Store the table as a RDS file
saveRDS(seqTab, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/remove-chimeras

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           unqs = readRDS(snakemake@input[[1]]),
           multithread=snakemake@threads
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::removeBimeraDenovo()")
}

# Remove chimeras
seqTab_nochimeras<-do.call(removeBimeraDenovo, args)

# Store the chimera-free sequence table as an RDS file
saveRDS(seqTab_nochimeras, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/sample-inference

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           derep = readRDS(snakemake@input[["derep"]]),
           err = readRDS(snakemake@input[["err"]]),
           multithread = snakemake@threads
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::dada()")
}

# Infer the sample composition (ASVs) with dada2::dada()
inferred_composition<-do.call(dada, args)

# Store the inferred sample composition as RDS files
saveRDS(inferred_composition, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Maintainers: public
URL: https://github.com/elsherbini/amplicon_for_terra
Name: amplicon_for_terra
Version: 1
Copyright: Public Domain
License: None