Snakemake Workflow: De Novo Transcriptome Assembly with HISAT2 and StringTie

public public 1yr ago 0 bookmarks

Snakemake workflow: De novo transcriptome assembly with Hisat2 and StringTie.

This pipeline comprises all steps of the 'new tuxedo suit' workflow published by Pertea et al. (1) and can be used to perform genome-guided de novo transcriptome assembly on bulk RNA-seq data with default parameters (without downstream analysis in R). Additionally, the piepline comprises:

  • adapter and quality trimmimg

  • read quality control with FASTQC

  • generation of a representative alingment file and bed file for the purpose of visualizing read coverage.

(1) Pertea, Mihaela; Kim, Daehwan; Pertea, Geo M.; Leek, Jeffrey T.; Salzberg, Steven L. (2016): Transcript-level expression analysis of RNA-seq experiments with HISAT, StringTie and Ballgown. In: Nature Protocols 11, 1650 EP

Code Snippets

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
__author__ = "Wibowo Arindrarto"
__copyright__ = "Copyright 2016, Wibowo Arindrarto"
__email__ = "[email protected]"
__license__ = "BSD"

from snakemake.shell import shell

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")
# Run log
log = snakemake.log_fmt_shell()

# Input file wrangling
reads = snakemake.input.get("reads")
if isinstance(reads, str):
    input_flags = "-U {0}".format(reads)
elif len(reads) == 1:
    input_flags = "-U {0}".format(reads[0])
elif len(reads) == 2:
    input_flags = "-1 {0} -2 {1}".format(*reads)
else:
    raise RuntimeError(
        "Reads parameter must contain at least 1 and at most 2"
        " input files.")

# Executed shell command
shell(
    "(hisat2 {extra} --threads {snakemake.threads}"
    " -x {snakemake.params.idx} {input_flags}"
    " | samtools view -Sbh -o {snakemake.output[0]} -)"
    " {log}")
74
75
wrapper:
	"0.35.0/bio/trimmomatic/pe"
94
95
script:
	"scripts/hisat2_wrapper0.34.0.py"
107
108
wrapper:
	"0.35.1/bio/samtools/sort"
SnakeMake From line 107 of master/Snakefile
115
116
wrapper:
	"0.35.1/bio/samtools/index"		
SnakeMake From line 115 of master/Snakefile
150
151
shell:
	"stringtie -v --merge -p {threads} -G {input.anno} -o {output} {input.gtf} 2> {log}"
164
165
shell:
	"gffcompare -G -r {input.anno} -o output/gffcompare/GFFcompare {input.st_transcripts}"
178
shell: "stringtie -e -B -p {threads} -G {input.anno} -o {output} {input.sbam}"
190
191
192
193
194
195
shell:
	"""
	cd output
	prepDE.py -i ballgown
	cd ..
	"""
SnakeMake From line 190 of master/Snakefile
212
213
wrapper:
	"0.34.0/bio/fastqc"
226
227
wrapper:
	"0.34.0/bio/fastqc"
239
240
wrapper:
	"0.35.1/bio/multiqc"
257
258
wrapper:
	"0.34.0/bio/fastqc"
274
275
wrapper:
	"0.35.1/bio/multiqc"
294
295
296
297
298
299
shell:
    """
    nreads=$(samtools view -c {input})
    rate=$(echo "scale=5;100000/$nreads" | bc)
    sambamba view -f bam -t 5 --subsampling-seed=42 -s $rate {input} | samtools sort -m 4G -@ 8 -T - > {output} 2> {log}
    """
311
312
wrapper:
    "0.35.1/bio/samtools/merge"
SnakeMake From line 311 of master/Snakefile
323
324
wrapper:
    "0.35.1/bio/samtools/index"
SnakeMake From line 323 of master/Snakefile
336
337
shell:
    "bamCoverage -b {input.bam} -o {output}"
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path
from tempfile import TemporaryDirectory

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

def basename_without_ext(file_path):
    """Returns basename of file path, without the file extension."""

    base = path.basename(file_path)

    split_ind = 2 if base.endswith(".gz") else 1
    base = ".".join(base.split(".")[:-split_ind])

    return base


# Run fastqc, since there can be race conditions if multiple jobs 
# use the same fastqc dir, we create a temp dir.
with TemporaryDirectory() as tempdir:
    shell("fastqc {snakemake.params} --quiet "
          "--outdir {tempdir} {snakemake.input[0]}"
          " {log}")

    # Move outputs into proper position.
    output_base = basename_without_ext(snakemake.input[0])
    html_path = path.join(tempdir, output_base + "_fastqc.html")
    zip_path = path.join(tempdir, output_base + "_fastqc.zip")

    if snakemake.output.html != html_path:
        shell("mv {html_path} {snakemake.output.html}")

    if snakemake.output.zip != zip_path:
        shell("mv {zip_path} {snakemake.output.zip}")
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
__author__ = "Johannes Köster, Jorge Langa"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


# Distribute available threads between trimmomatic itself and any potential pigz instances
def distribute_threads(input_files, output_files, available_threads):
    gzipped_input_files = sum(1 for file in input_files if file.endswith(".gz"))
    gzipped_output_files = sum(1 for file in output_files if file.endswith(".gz"))
    potential_threads_per_process = available_threads // (1 + gzipped_input_files + gzipped_output_files)
    if potential_threads_per_process > 0:
        # decompressing pigz creates at most 4 threads
        pigz_input_threads = min(4, potential_threads_per_process) if gzipped_input_files != 0 else 0
        pigz_output_threads = \
            (available_threads - pigz_input_threads * gzipped_input_files) // (1 + gzipped_output_files) \
                if gzipped_output_files != 0 else 0
        trimmomatic_threads = available_threads - pigz_input_threads * gzipped_input_files - \
                              pigz_output_threads * gzipped_output_files
    else:
        # not enough threads for pigz
        pigz_input_threads = 0
        pigz_output_threads = 0
        trimmomatic_threads = available_threads
    return trimmomatic_threads, pigz_input_threads, pigz_output_threads


def compose_input_gz(filename, threads):
    if filename.endswith(".gz") and threads > 0:
        return "<(pigz -p {threads} --decompress --stdout {filename})".format(
            threads=threads,
            filename=filename
        )
    return filename


def compose_output_gz(filename, threads, compression_level):
    if filename.endswith(".gz") and threads > 0:
        return ">(pigz -p {threads} {compression_level} > {filename})".format(
            threads=threads,
            compression_level=compression_level,
            filename=filename
        )
    return filename


extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
compression_level = snakemake.params.get("compression_level", "-5")
trimmer = " ".join(snakemake.params.trimmer)

# Distribute threads
input_files = [snakemake.input.r1, snakemake.input.r2]
output_files = [snakemake.output.r1, snakemake.output.r1_unpaired,
                snakemake.output.r2, snakemake.output.r2_unpaired]

trimmomatic_threads, input_threads, output_threads = distribute_threads(
    input_files,
    output_files,
    snakemake.threads
)

input_r1, input_r2 = [compose_input_gz(filename, input_threads) for filename in input_files]

output_r1, output_r1_unp, output_r2, output_r2_unp = [
    compose_output_gz(filename, output_threads, compression_level) for filename in output_files
]

shell(
    "trimmomatic PE -threads {trimmomatic_threads} {extra} "
    "{input_r1} {input_r2} "
    "{output_r1} {output_r1_unp} "
    "{output_r2} {output_r2_unp} "
    "{trimmer} "
    "{log}"
)
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


input_dirs = set(path.dirname(fp) for fp in snakemake.input)
output_dir = path.dirname(snakemake.output[0])
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc"
    " {snakemake.params}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_dirs}"
    " {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


shell("samtools index {snakemake.params} {snakemake.input[0]} {snakemake.output[0]}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

# Samtools takes additional threads through its option -@
# One thread for samtools merge
# Other threads are *additional* threads passed to the '-@' argument
threads = (
    "" if snakemake.threads <= 1
    else " -@ {} ".format(snakemake.threads - 1)
)

shell("samtools merge {threads} {snakemake.params} "
      "{snakemake.output[0]} {snakemake.input}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from snakemake.shell import shell


prefix = os.path.splitext(snakemake.output[0])[0]

# Samtools takes additional threads through its option -@
# One thread for samtools
# Other threads are *additional* threads passed to the argument -@
threads = (
    "" if snakemake.threads <= 1
    else " -@ {} ".format(snakemake.threads - 1)
)

shell(
    "samtools sort {snakemake.params} {threads} -o {snakemake.output[0]} "
    "-T {prefix} {snakemake.input[0]}")
ShowHide 16 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/c-kroeger/snakemake-hisat2-stringtie
Name: snakemake-hisat2-stringtie
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...