Snakemake Pipeline for Raw FASTQ Processing with Terra Compatibility


Here is a Snakemake pipeline that processes raw FASTQ files and outputs several tables (a chimera-free ASV sequence table plus DECIPHER and SILVA taxonomy assignments) for further processing in R.

I think a path to making this run on Terra is to use the official Snakemake Docker container: https://hub.docker.com/r/snakemake/snakemake
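
A hedged sketch of that setup: a Terra/WDL task could use snakemake/snakemake as its runtime image and simply invoke Snakemake inside it. The target file names below are placeholders for the tables this pipeline hands off to R, not paths taken from the repository.

# Illustrative only: with snakemake/snakemake as the task's Docker image,
# the task command could be, e.g.:
#     snakemake --use-conda --cores 8
# The inputs of `rule all` are hypothetical final targets.
rule all:
    input:
        "results/seqtab.nochim.collapsed.rds",
        "results/taxonomy_decipher.rds",
        "results/taxonomy_silva_species.rds",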

Code Snippets

scripts/assign_taxonomy_decipher.R

library(DECIPHER)
library(dada2)
library(Biostrings)

seqtab.nochim <- readRDS(snakemake@input[["seqtab"]])

dna <- DNAStringSet(getSequences(seqtab.nochim)) # Create a DNAStringSet from the ASVs
load(snakemake@input[["decipher_db"]]) # loads the DECIPHER training set object (trainingSet); path comes from the rule input
ids <- IdTaxa(dna, trainingSet, strand = "both", processors = snakemake@threads, verbose = FALSE) # use the threads allocated by Snakemake
ranks <- c("domain", "phylum", "class", "order", "family", "genus") # ranks of interest
# Convert the output object of class "Taxa" to a matrix analogous to the output from assignTaxonomy
taxid <- t(sapply(ids, function(x) {
        m <- match(ranks, x$rank)
        taxa <- x$taxon[m]
        taxa[startsWith(taxa, "unclassified_")] <- NA
        taxa
}))
colnames(taxid) <- ranks
rownames(taxid) <- getSequences(seqtab.nochim)

saveRDS(taxid, snakemake@output[["taxonomy_decipher"]])

scripts/assign_taxonomy_silva_species.R

library(dada2)
library(Biostrings)

seqtab.nochim <- readRDS(snakemake@input[["seqtab"]])

dna <- DNAStringSet(getSequences(seqtab.nochim)) # Create a DNAStringSet from the ASVs

species_assignment <- assignSpecies(
                            dna,
                            snakemake@input[["silva_species_db"]],
                            allowMultiple = TRUE,
                            tryRC = TRUE,
                            n = 1000,
                            verbose = FALSE)


saveRDS(species_assignment, snakemake@output[["taxonomy_silva_species"]])

wrapper:
    "0.79.0/bio/dada2/filter-trim"
Snakemake, from line 41 of main/Snakefile
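
The rule bodies from main/Snakefile are not reproduced on this page, so here is a hedged sketch of a rule that could drive this wrapper, built from the input/output keys its script (shown further down) reads; sample wildcards, file paths, params, and thread counts are illustrative assumptions.

rule filter_trim:
    input:
        fwd="fastq/{sample}_R1.fastq.gz",   # hypothetical raw read paths
        rev="fastq/{sample}_R2.fastq.gz",
    output:
        filt="filtered/{sample}_R1.fastq.gz",      # .gz extension => compress=TRUE in the wrapper
        filt_rev="filtered/{sample}_R2.fastq.gz",
        stats="reports/{sample}.filter_trim.tsv",
    params:
        maxEE=2,      # illustrative dada2::filterAndTrim() options
        truncQ=2,
    threads: 4
    log:
        "logs/filter_trim/{sample}.log",
    wrapper:
        "0.79.0/bio/dada2/filter-trim"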

wrapper:
    "0.79.0/bio/dada2/learn-errors"
Snakemake, from line 62 of main/Snakefile
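
Similarly, a hedged sketch for the error-learning step; the wrapper script passes all inputs to dada2::learnErrors() and writes an RDS (err) plus a diagnostic plot (plot). SAMPLES and the paths are assumptions.

rule learn_errors:
    input:
        expand("filtered/{sample}_R1.fastq.gz", sample=SAMPLES),  # SAMPLES: assumed sample list
    output:
        err="results/errors_R1.rds",
        plot="results/errors_R1.png",
    threads: 8
    log:
        "logs/learn_errors_R1.log",
    wrapper:
        "0.79.0/bio/dada2/learn-errors"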

wrapper:
    "0.79.0/bio/dada2/dereplicate-fastq"
Snakemake, from line 78 of main/Snakefile
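
A hedged sketch for dereplication; the wrapper script passes the FASTQ inputs to dada2::derepFastq() and saves a single RDS output (paths and wildcards are illustrative).

rule dereplicate:
    input:
        "filtered/{sample}_R1.fastq.gz",
    output:
        "uniques/{sample}_R1.rds",
    log:
        "logs/dereplicate/{sample}_R1.log",
    wrapper:
        "0.79.0/bio/dada2/dereplicate-fastq"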

wrapper:
    "0.79.0/bio/dada2/sample-inference"
Snakemake, from line 96 of main/Snakefile
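
A hedged sketch for sample inference; the wrapper script expects a dereplicated-reads RDS under the input key derep and an error-model RDS under err (paths are illustrative).

rule sample_inference:
    input:
        derep="uniques/{sample}_R1.rds",
        err="results/errors_R1.rds",
    output:
        "denoised/{sample}_R1.rds",
    threads: 4
    log:
        "logs/sample_inference/{sample}_R1.log",
    wrapper:
        "0.79.0/bio/dada2/sample-inference"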

wrapper:
    "0.79.0/bio/dada2/make-table"
Snakemake, from line 112 of main/Snakefile
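
A hedged sketch for building the sequence table; the wrapper script reads every input RDS and accepts an optional names parameter to label the samples (SAMPLES and paths are assumptions).

rule make_table:
    input:
        expand("denoised/{sample}_R1.rds", sample=SAMPLES),  # SAMPLES: assumed sample list
    output:
        "results/seqtab.rds",
    params:
        names=SAMPLES,   # row names for the sequence table
    log:
        "logs/make_table.log",
    wrapper:
        "0.79.0/bio/dada2/make-table"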

wrapper:
    "0.79.0/bio/dada2/remove-chimeras"
Snakemake, from line 125 of main/Snakefile
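
A hedged sketch for chimera removal; the wrapper script takes the sequence-table RDS as its only input and writes the chimera-free table as an RDS (paths are illustrative).

rule remove_chimeras:
    input:
        "results/seqtab.rds",
    output:
        "results/seqtab.nochim.rds",
    threads: 8
    log:
        "logs/remove_chimeras.log",
    wrapper:
        "0.79.0/bio/dada2/remove-chimeras"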

wrapper:
    "0.79.0/bio/dada2/collapse-nomismatch"
Snakemake, from line 138 of main/Snakefile
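
A hedged sketch for collapsing sequences that are identical apart from length variation; the wrapper script again takes one RDS in and writes one RDS out (paths are illustrative).

rule collapse_nomismatch:
    input:
        "results/seqtab.nochim.rds",
    output:
        "results/seqtab.nochim.collapsed.rds",
    log:
        "logs/collapse_nomismatch.log",
    wrapper:
        "0.79.0/bio/dada2/collapse-nomismatch"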

script:
    "scripts/assign_taxonomy_decipher.R"
Snakemake, from line 150 of main/Snakefile
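
A hedged sketch of the rule driving this script; the script (listed at the top of the page) reads seqtab and decipher_db inputs, uses the rule's threads, and writes the taxonomy_decipher output. The training-set path is a placeholder.

rule assign_taxonomy_decipher:
    input:
        seqtab="results/seqtab.nochim.collapsed.rds",
        decipher_db="db/decipher_training_set.RData",  # placeholder for the DECIPHER training set
    output:
        taxonomy_decipher="results/taxonomy_decipher.rds",
    threads: 8
    script:
        "scripts/assign_taxonomy_decipher.R"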

script:
    "scripts/assign_taxonomy_silva_species.R"
Snakemake, from line 162 of main/Snakefile
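
And a hedged sketch for the species-level assignment; its script reads seqtab and silva_species_db inputs and writes the taxonomy_silva_species output (the database path is a placeholder).

rule assign_taxonomy_silva_species:
    input:
        seqtab="results/seqtab.nochim.collapsed.rds",
        silva_species_db="db/silva_species_assignment.fa.gz",  # placeholder for the SILVA species FASTA
    output:
        taxonomy_silva_species="results/taxonomy_silva_species.rds",
    script:
        "scripts/assign_taxonomy_silva_species.R"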

Wrapper script: 0.79.0/bio/dada2/collapse-nomismatch

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           seqtab = readRDS(snakemake@input[[1]])
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::collapseNoMismatch()")
}

# Collapse sequences
taxa<-do.call(collapseNoMismatch, args)

# Store the resulting table as a RDS file
saveRDS(taxa, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/dereplicate-fastq

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list( fls = unlist(snakemake@input))
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::derepFastq()")
}
# Dereplicate
uniques<-do.call(derepFastq, args)

# Store as RDS file
saveRDS(uniques,snakemake@output[[1]])

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/filter-trim

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
        fwd = snakemake@input[["fwd"]],
        filt = snakemake@output[["filt"]],
        multithread=snakemake@threads
)
# Test if paired end input is passed
if(!is.null(snakemake@input[["rev"]]) & !is.null(snakemake@output[["filt_rev"]])){
        args<-c(args,
            rev = snakemake@input[["rev"]],
            filt.rev = snakemake@output[["filt_rev"]]
            )
}
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
    # Keeping only the named elements of the list for do.call()
    extra<-snakemake@params[ names(snakemake@params) != "" ]
    # Check if 'compress=' option is passed
    if(!is.null(extra[["compress"]])){
        stop("Remove the `compress=` option from `params`.\n",
            "The `compress` option is implicitly set here from the file extension.")
    } else {
        # Check if output files are given as compressed files
        # ex: in se version, all(TRUE, NULL) gives TRUE
        compressed <- c(
            endsWith(args[["filt"]], '.gz'),
            if(is.null(args[["filt.rev"]])) NULL else {endsWith(args[["filt.rev"]], 'gz')}
        )
        if ( all(compressed) ) {
            extra[["compress"]] <- TRUE
        } else if ( any(compressed) ) {
            stop("Either all or no fastq output should be compressed. Please check `output.filt` and `output.filt_rev` for consistency.")
        } else {
            extra[["compress"]] <- FALSE
        }
    }
    # Add them to the list of arguments
    args<-c(args, extra)
} else {
    message("No optional parameters. Using default parameters from dada2::filterAndTrim()")
}

# Call the function with arguments
filt.stats<-do.call(filterAndTrim, args)

# Write processed reads report
write.table(filt.stats, snakemake@output[["stats"]], sep="\t", quote=F)
# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/learn-errors

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           fls = snakemake@input,
           multithread=snakemake@threads
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using defaults parameters from dada2::learnErrors()")
}

# Learn error rates from the input reads
err<-do.call(learnErrors, args)

# Plot estimated versus observed error rates to validate models
perr<-plotErrors(err, nominalQ = TRUE)

# Save the plots
library(ggplot2)
ggsave(snakemake@output[["plot"]], perr, width = 8, height = 8, dpi = 300)

# Store the estimated errors as RDS files
saveRDS(err, snakemake@output[["err"]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/make-table

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# If names are provided use them
nm<-if(is.null(snakemake@params[["names"]])) NULL else snakemake@params[["names"]]

# From a list of n lists to one named list of n elements 
smps<-setNames(
               object=unlist(snakemake@input),
               nm=nm
               )
# Read the RDS into the list
smps<-lapply(smps, readRDS)

# Prepare arguments (no matter the order)
args<-list( samples = smps)
# Check if extra params are passed (apart from [["names"]])
if(length(snakemake@params) > 1 ){
       # Keeping only the named elements of the list for do.call() (apart from [["names"]])
       extra<-snakemake@params[ names(snakemake@params) != "" & names(snakemake@params) != "names" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::makeSequenceTable()")
}

# Make table
seqTab<-do.call(makeSequenceTable, args)

# Store the table as a RDS file
saveRDS(seqTab, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/remove-chimeras

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           unqs = readRDS(snakemake@input[[1]]),
           multithread=snakemake@threads
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::removeBimeraDenovo()")
}

# Remove chimeras
seqTab_nochimeras<-do.call(removeBimeraDenovo, args)

# Store the chimera-free sequence table as an RDS file
saveRDS(seqTab_nochimeras, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Wrapper script: 0.79.0/bio/dada2/sample-inference

log.file<-file(snakemake@log[[1]],open="wt")
sink(log.file)
sink(log.file,type="message")

library(dada2)

# Prepare arguments (no matter the order)
args<-list(
           derep = readRDS(snakemake@input[["derep"]]),
           err = readRDS(snakemake@input[["err"]]),
           multithread = snakemake@threads
           )
# Check if extra params are passed
if(length(snakemake@params) > 0 ){
       # Keeping only the named elements of the list for do.call()
       extra<-snakemake@params[ names(snakemake@params) != "" ]
       # Add them to the list of arguments
       args<-c(args, extra)
} else{
    message("No optional parameters. Using default parameters from dada2::dada()")
}

# Infer the sample composition (ASVs) with dada2::dada()
inferred_composition<-do.call(dada, args)

# Store the inferred sample composition as RDS files
saveRDS(inferred_composition, snakemake@output[[1]],compress = T)

# Proper syntax to close the connection for the log file
# but could be optional for Snakemake wrapper
sink(type="message")
sink()

Maintainers: public
URL: https://github.com/elsherbini/amplicon_for_terra
Name: amplicon_for_terra
Version: 1
Copyright: Public Domain
License: None