RNA-seq workflow using STAR and DESeq2


This workflow performs a differential gene expression analysis with STAR and DESeq2.

Usage

The usage of this workflow is described in the Snakemake Workflow Catalog.

If you use this workflow in a paper, please give credit to the authors by citing the URL of this (original) repository and its DOI (see above).

Code Snippets

Wrapper v1.21.4/bio/bwa/index:
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2016, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"

from os.path import splitext

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if len(snakemake.input) == 0:
    raise ValueError("A reference genome has to be provided!")
elif len(snakemake.input) > 1:
    raise ValueError("Only one reference genome can be inputed!")

# Prefix that should be used for the database
prefix = snakemake.params.get("prefix", splitext(snakemake.output.idx[0])[0])

if len(prefix) > 0:
    prefix = "-p " + prefix

# Construction algorithm that will be used to build the database, default is bwtsw
construction_algorithm = snakemake.params.get("algorithm", "")

if len(construction_algorithm) != 0:
    construction_algorithm = "-a " + construction_algorithm

shell(
    "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}"
)
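To illustrate how this wrapper script receives its inputs, here is a sketch of a rule that could drive it. The genome path, rule name, and log location are placeholders only; the workflow's actual rule lives in rules/ref.smk (line 54, see the rule excerpts further down).

rule bwa_index:
    input:
        "resources/genome.fasta",
    output:
        # the wrapper derives the index prefix from the first idx file
        idx=multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
    params:
        algorithm="bwtsw",  # optional, passed to bwa index as "-a bwtsw"
    log:
        "logs/bwa_index.log",
    wrapper:
        "v1.21.4/bio/bwa/index"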
Wrapper v1.21.4/bio/cutadapt/pe:
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 2, "Input must contain 2 (paired-end) elements."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq1}"
    " -p {snakemake.output.fastq2}"
    " {snakemake.input}"
    " > {snakemake.output.qc} {log}"
)
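A hypothetical rule invoking this paired-end wrapper might look as follows. The read paths, adapter sequence, and trimming options are illustrative; the workflow's actual rule (rules/trim.smk, line 38) derives them from its configuration.

rule cutadapt_pe:
    input:
        ["reads/{sample}_R1.fastq.gz", "reads/{sample}_R2.fastq.gz"],
    output:
        fastq1="results/trimmed/{sample}_R1.fastq.gz",
        fastq2="results/trimmed/{sample}_R2.fastq.gz",
        qc="results/trimmed/{sample}.qc.txt",
    params:
        # either adapters or extra must be non-empty, see the assert above
        adapters="-a AGATCGGAAGAGC -A AGATCGGAAGAGC",
        extra="--minimum-length 1 -q 20",
    log:
        "logs/cutadapt/{sample}.log",
    threads: 8
    wrapper:
        "v1.21.4/bio/cutadapt/pe"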
Wrapper v1.21.4/bio/cutadapt/se:
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 1, "Input must contain 1 (single-end) element."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq}"
    " {snakemake.input[0]}"
    " > {snakemake.output.qc} {log}"
)
Wrapper v1.21.4/bio/multiqc:
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
# Set this to False if multiqc should use the actual input directly
# instead of parsing the folders where the provided files are located
use_input_files_only = snakemake.params.get("use_input_files_only", False)

if not use_input_files_only:
    input_data = set(path.dirname(fp) for fp in snakemake.input)
else:
    input_data = set(snakemake.input)

output_dir = path.dirname(snakemake.output[0])
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc"
    " {extra}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_data}"
    " {log}"
)
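For reference, a minimal rule using this wrapper could look like the sketch below. The input and report paths are made up; the workflow's own rule aggregates its QC and log outputs.

rule multiqc:
    input:
        # any QC/log files; by default their parent directories are passed to MultiQC
        "results/qc/rseqc/sample1.stats.txt",
        "logs/cutadapt/sample1.log",
    output:
        "results/qc/multiqc_report.html",
    params:
        extra="",
        use_input_files_only=False,  # set True to pass the files themselves
    log:
        "logs/multiqc.log",
    wrapper:
        "v1.21.4/bio/multiqc"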
Wrapper v1.21.4/bio/reference/ensembl-annotation:
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess
import sys
from pathlib import Path
from snakemake.shell import shell


log = snakemake.log_fmt_shell(stdout=False, stderr=True)


species = snakemake.params.species.lower()
build = snakemake.params.build
release = int(snakemake.params.release)
out_fmt = Path(snakemake.output[0]).suffixes
out_gz = (out_fmt.pop() and True) if out_fmt[-1] == ".gz" else False
out_fmt = out_fmt.pop().lstrip(".")


branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"


flavor = snakemake.params.get("flavor", "")
if flavor:
    flavor += "."


suffix = ""
if out_fmt == "gtf":
    suffix = "gtf.gz"
elif out_fmt == "gff3":
    suffix = "gff3.gz"
else:
    raise ValueError(
        "invalid format specified. Only 'gtf[.gz]' and 'gff3[.gz]' are currently supported."
    )


url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
    release=release,
    build=build,
    species=species,
    out_fmt=out_fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)


try:
    if out_gz:
        shell("curl -L {url} > {snakemake.output[0]} {log}")
    else:
        shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
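Driven from a rule, the script above only needs the output path (to infer gtf/gff3 and compression) plus species, build, and release parameters. The values below are placeholders; the workflow's rule is at rules/ref.smk, line 28.

rule get_annotation:
    output:
        "resources/genome.gtf",  # .gtf or .gff3, optionally with .gz
    params:
        species="homo_sapiens",
        build="GRCh38",
        release="105",
        flavor="",  # optional, e.g. "chr_patch_hapl_scaff"
    log:
        "logs/get_annotation.log",
    wrapper:
        "v1.21.4/bio/reference/ensembl-annotation"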
Wrapper v1.21.4/bio/reference/ensembl-sequence:
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
import sys
from itertools import product
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

spec = spec.format(build=build, release=release)
url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether above URL is currently available (might be a temporal server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
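A corresponding rule for this sequence wrapper could be sketched as below; again, paths and parameter values are examples only (the workflow's rule is at rules/ref.smk, line 12).

rule get_genome:
    output:
        "resources/genome.fasta",
    params:
        species="homo_sapiens",
        build="GRCh38",
        release="105",
        datatype="dna",  # one of dna, cdna, cds, ncrna, pep
    log:
        "logs/get_genome.log",
    wrapper:
        "v1.21.4/bio/reference/ensembl-sequence"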
Wrapper v1.21.4/bio/samtools/faidx:
__author__ = "Michael Chambers"
__copyright__ = "Copyright 2019, Michael Chambers"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(
    snakemake, parse_threads=False, parse_write_index=False, parse_output_format=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("samtools faidx {samtools_opts} {extra} {snakemake.input[0]} {log}")
Wrapper v1.21.4/bio/sra-tools/fasterq-dump:
__author__ = "Johannes Köster, Derek Croote"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import os
import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import get_mem


log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")


# Parse memory
mem_mb = get_mem(snakemake, "MiB")


# Outdir
outdir = os.path.dirname(snakemake.output[0])
if outdir:
    outdir = f"--outdir {outdir}"


# Output compression
compress = ""
mem = f"-m{mem_mb}" if mem_mb else ""

for output in snakemake.output:
    out_name, out_ext = os.path.splitext(output)
    if out_ext == ".gz":
        compress += f"pigz -p {snakemake.threads} {out_name}; "
    elif out_ext == ".bz2":
        compress += f"pbzip2 -p{snakemake.threads} {mem} {out_name}; "


with tempfile.TemporaryDirectory() as tmpdir:
    mem = f"--mem {mem_mb}M" if mem_mb else ""

    shell(
        "(fasterq-dump --temp {tmpdir} --threads {snakemake.threads} {mem} "
        "{extra} {outdir} {snakemake.wildcards.accession}; "
        "{compress}"
        ") {log}"
    )
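Because compression is decided from the output extensions and the SRA run comes from a wildcard, a rule using this wrapper could be sketched as below. The output layout and the extra flag are assumptions; the workflow references this wrapper for samples fetched from SRA.

rule get_sra_reads:
    output:
        # .gz endings trigger pigz compression inside the wrapper
        "sra/{accession}_1.fastq.gz",
        "sra/{accession}_2.fastq.gz",
    params:
        extra="--skip-technical",
    log:
        "logs/sra/{accession}.log",
    threads: 6
    wrapper:
        "v1.21.4/bio/sra-tools/fasterq-dump"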
Wrapper v1.21.4/bio/star/align:
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
import tempfile
from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


fq1 = snakemake.input.get("fq1")
assert fq1 is not None, "input-> fq1 is a required input parameter"
fq1 = (
    [snakemake.input.fq1]
    if isinstance(snakemake.input.fq1, str)
    else snakemake.input.fq1
)
fq2 = snakemake.input.get("fq2")
if fq2:
    fq2 = (
        [snakemake.input.fq2]
        if isinstance(snakemake.input.fq2, str)
        else snakemake.input.fq2
    )
    assert len(fq1) == len(
        fq2
    ), "input-> equal number of files required for fq1 and fq2"
input_str_fq1 = ",".join(fq1)
input_str_fq2 = ",".join(fq2) if fq2 is not None else ""
input_str = " ".join([input_str_fq1, input_str_fq2])


if fq1[0].endswith(".gz"):
    readcmd = "--readFilesCommand gunzip -c"
elif fq1[0].endswith(".bz2"):
    readcmd = "--readFilesCommand bunzip2 -c"
else:
    readcmd = ""


index = snakemake.input.get("idx")
if not index:
    index = snakemake.params.get("idx", "")


if "--outSAMtype BAM SortedByCoordinate" in extra:
    stdout = "BAM_SortedByCoordinate"
elif "BAM Unsorted" in extra:
    stdout = "BAM_Unsorted"
else:
    stdout = "SAM"


with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "STAR "
        " --runThreadN {snakemake.threads}"
        " --genomeDir {index}"
        " --readFilesIn {input_str}"
        " {readcmd}"
        " {extra}"
        " --outTmpDir {tmpdir}/STARtmp"
        " --outFileNamePrefix {tmpdir}/"
        " --outStd {stdout}"
        " > {snakemake.output.aln}"
        " {log}"
    )

    if snakemake.output.get("reads_per_gene"):
        shell("cat {tmpdir}/ReadsPerGene.out.tab > {snakemake.output.reads_per_gene:q}")
    if snakemake.output.get("chim_junc"):
        shell("cat {tmpdir}/Chimeric.out.junction > {snakemake.output.chim_junc:q}")
    if snakemake.output.get("sj"):
        shell("cat {tmpdir}/SJ.out.tab > {snakemake.output.sj:q}")
    if snakemake.output.get("log"):
        shell("cat {tmpdir}/Log.out > {snakemake.output.log:q}")
    if snakemake.output.get("log_progress"):
        shell("cat {tmpdir}/Log.progress.out > {snakemake.output.log_progress:q}")
    if snakemake.output.get("log_final"):
        shell("cat {tmpdir}/Log.final.out > {snakemake.output.log_final:q}")
Wrapper v1.21.4/bio/star/index:
__author__ = "Thibault Dayris"
__copyright__ = "Copyright 2019, Dayris Thibault"
__email__ = "[email protected]"
__license__ = "MIT"

import tempfile
from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")
sjdb_overhang = snakemake.params.get("sjdbOverhang", "100")

gtf = snakemake.input.get("gtf")
if gtf is not None:
    gtf = f"--sjdbGTFfile {gtf}"
    sjdb_overhang = f"--sjdbOverhang {sjdb_overhang}"
else:
    gtf = sjdb_overhang = ""

makedirs(snakemake.output)

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "STAR"
        " --runThreadN {snakemake.threads}"  # Number of threads
        " --runMode genomeGenerate"  # Indexation mode
        " --genomeFastaFiles {snakemake.input.fasta}"  # Path to fasta files
        " {sjdb_overhang}"  # Read-len - 1
        " {gtf}"  # Highly recommended GTF
        " {extra}"  # Optional parameters
        " --outTmpDir {tmpdir}/STARtmp"  # Temp dir
        " --genomeDir {snakemake.output}"  # Path to output
        " {log}"  # Logging
    )
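A matching index rule could be sketched as follows; the directory name and thread count are placeholders, and supplying the GTF plus sjdbOverhang (read length minus one) is recommended but optional, as the script above shows. The workflow's rule is at rules/ref.smk, line 70.

rule star_index:
    input:
        fasta="resources/genome.fasta",
        gtf="resources/genome.gtf",
    output:
        directory("resources/star_genome"),
    params:
        sjdbOverhang="100",
        extra="",
    log:
        "logs/star_index.log",
    threads: 12
    wrapper:
        "v1.21.4/bio/star/index"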
Rule excerpts (wrapper, script, and shell directives from the workflow's rule files):

wrapper:
    "v1.21.4/bio/star/align"

script:
    "../scripts/count-matrix.py"

script:
    "../scripts/gene2symbol.R"

script:
    "../scripts/deseq2-init.R"

script:
    "../scripts/plot-pca.R"

script:
    "../scripts/deseq2.R"

script:
    "../scripts/gtf2bed.py"
From line 14 of rules/qc.smk

shell:
    "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log[0]} 2>&1"
From line 32 of rules/qc.smk

shell:
    "junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log} 2>&1"
From line 51 of rules/qc.smk

shell:
    "bam_stat.py -i {input} > {output} 2> {log}"
From line 66 of rules/qc.smk

shell:
    "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
From line 81 of rules/qc.smk

shell:
    "inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1"
From line 98 of rules/qc.smk

shell:
    "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
From line 113 of rules/qc.smk

shell:
    "read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1"
From line 129 of rules/qc.smk

shell:
    "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"
From line 145 of rules/qc.smk

wrapper:
    "v1.21.4/bio/multiqc"

wrapper:
    "v1.21.4/bio/reference/ensembl-sequence"
From line 12 of rules/ref.smk

wrapper:
    "v1.21.4/bio/reference/ensembl-annotation"
From line 28 of rules/ref.smk

wrapper:
    "v1.21.4/bio/samtools/faidx"
From line 40 of rules/ref.smk

wrapper:
    "v1.21.4/bio/bwa/index"
From line 54 of rules/ref.smk

wrapper:
    "v1.21.4/bio/star/index"
From line 70 of rules/ref.smk

wrapper:
    "v1.21.4/bio/sra-tools/fasterq-dump"

shell:
    "cat {input} > {output} 2> {log}"
From line 21 of rules/trim.smk

wrapper:
    "v1.21.4/bio/cutadapt/pe"
From line 38 of rules/trim.smk

wrapper:
    "v1.21.4/bio/cutadapt/se"
From line 54 of rules/trim.smk
Script scripts/count-matrix.py:
import sys

# logging
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def get_column(strandedness):
    if pd.isnull(strandedness) or strandedness == "none":
        return 1  # non stranded protocol
    elif strandedness == "yes":
        return 2  # 3rd column
    elif strandedness == "reverse":
        return 3  # 4th column, usually for Illumina truseq
    else:
        raise ValueError(
            (
                "'strandedness' column should be empty or have the "
                "value 'none', 'yes' or 'reverse', instead has the "
                "value {}"
            ).format(repr(strandedness))
        )


counts = [
    pd.read_table(
        f, index_col=0, usecols=[0, get_column(strandedness)], header=None, skiprows=4
    )
    for f, strandedness in zip(snakemake.input, snakemake.params.strand)
]

for t, sample in zip(counts, snakemake.params.samples):
    t.columns = [sample]

matrix = pd.concat(counts, axis=1)
matrix.index.name = "gene"
# collapse technical replicates
matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum()
matrix.to_csv(snakemake.output[0], sep="\t")
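The script expects one STAR ReadsPerGene.out.tab file per unit plus matching samples and strand params, picks the unstranded, forward, or reverse count column accordingly, and sums technical replicates that share a sample name. A hypothetical driving rule follows; the sample names and strandedness values are placeholders, as the real rule fills them in from the sample/unit sheets.

rule count_matrix:
    input:
        expand("results/star/{sample}/ReadsPerGene.out.tab", sample=["A", "B"]),
    output:
        "results/counts/all.tsv",
    params:
        samples=["A", "B"],
        strand=["none", "reverse"],  # one entry per input file
    log:
        "logs/count-matrix.log",
    script:
        "../scripts/count-matrix.py"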
Script scripts/deseq2-init.R:
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type="message")

library(stringr)
library("DESeq2")

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

counts_data <- read.table(
  snakemake@input[["counts"]],
  header = TRUE,
  row.names = "gene",
  check.names = FALSE
)
counts_data <- counts_data[, order(names(counts_data))]

col_data <- read.table(
  snakemake@config[["samples"]],
  header = TRUE,
  row.names = "sample_name",
  check.names = FALSE
)
col_data <- col_data[order(row.names(col_data)), , drop = FALSE]

# properly set the base level to the configuration in config.yaml, avoiding
# the default behaviour of choosing the alphabetical minimum level
for (vof in names(snakemake@config[["diffexp"]][["variables_of_interest"]])) {
  snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]]
  base_level <- snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]][["base_level"]]
  col_data[[vof]] <- relevel(
    factor(col_data[[vof]]), base_level
  )
}

# properly turn all batch effects into factors, even if they are numeric
batch_effects <- snakemake@config[["diffexp"]][["batch_effects"]]
for (effect in batch_effects) {
  if (str_length(effect) > 0) {
    col_data[[effect]] <- factor(col_data[[effect]])
  }
}

# build up formula with additive batch_effects and all interactions between the
# variables_of_interest

design_formula <- snakemake@config[["diffexp"]][["model"]]

if (str_length(design_formula) == 0) {
  batch_effects <- str_flatten(batch_effects, " + ")
  if (str_length(batch_effects) > 0) {
    batch_effects <- str_c(batch_effects, " + ")
  }
  vof_interactions <- str_flatten(
    names(snakemake@config[["diffexp"]][["variables_of_interest"]]),
    " * "
  )
  design_formula <- str_c("~", batch_effects, vof_interactions)
}

dds <- DESeqDataSetFromMatrix(
  countData = counts_data,
  colData = col_data,
  design = as.formula(design_formula)
)

# remove uninformative columns
dds <- dds[rowSums(counts(dds)) > 1, ]
# normalization and preprocessing
dds <- DESeq(dds, parallel = parallel)

# Write dds object as RDS
saveRDS(dds, file = snakemake@output[[1]])
# Write normalized counts
norm_counts <- counts(dds, normalized = TRUE)
write.table(
  data.frame(
    "gene" = rownames(norm_counts),
    norm_counts
  ),
  file = snakemake@output[[2]],
  sep = "\t",
  row.names = FALSE
)
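This script reads the count matrix plus the sample sheet and the diffexp settings (variables_of_interest, batch_effects, model) from the global config, so a driving rule only wires up the files. A sketch with placeholder paths:

rule deseq2_init:
    input:
        counts="results/counts/all.tsv",
    output:
        "results/deseq2/all.rds",         # snakemake@output[[1]]: the dds object
        "results/deseq2/normcounts.tsv",  # snakemake@output[[2]]: normalized counts
    log:
        "logs/deseq2/init.log",
    threads: 4
    script:
        "../scripts/deseq2-init.R"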
Script scripts/deseq2.R:
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("cli")
library("DESeq2")

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

dds <- readRDS(snakemake@input[[1]])

contrast_config <- snakemake@config[["diffexp"]][["contrasts"]][[
    snakemake@wildcards[["contrast"]]
]]

# basic case of contrast specification, see:
# https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts
if (length(contrast_config) == 2 && typeof(contrast_config) == "list") {
  if (
    # check for existence contrast's variable_of_interest to
    # provide useful error message
    !(contrast_config[["variable_of_interest"]] %in%
      names(snakemake@config[["diffexp"]][["variables_of_interest"]])
    )
  ) {
      cli_abort(
        c(
                "config.yaml: All variable_of_interest entries under `diffexp: contrasts:`",
          " " = "must also exist under `diffexp: variables_of_interest:`.",
          "x" = "Could not find variable_of_interest: {contrast_config[['variable_of_interest']]}",
          " " = "It was not among the `diffexp: variables_of_interest:`",
          " " = "{names(snakemake@config[['diffexp']][['variables_of_interest']])}",
          "i" = "Are there any typos in the contrasts' `variable_of_interest:` entries?"
        )
      )
  }
  contrast <- c(
    contrast_config[["variable_of_interest"]],
    contrast_config[["level_of_interest"]],
    snakemake@config[["diffexp"]][["variables_of_interest"]][[
      contrast_config[["variable_of_interest"]]
    ]][["base_level"]]
  )
# more complex contrast specification via list(c(), c()), see ?results docs of
# the DESeq2 package and this tutorial (plus the linked seqanswers thread):
# https://github.com/tavareshugo/tutorial_DESeq2_contrasts/blob/main/DESeq2_contrasts.md
} else if (
    length(contrast_config) == 1 &&
    typeof(contrast_config) == "character"
  ) {
  contrast <- eval(parse(text = contrast_config))
}

res <- results(
  dds,
  contrast = contrast,
  parallel = parallel
)
# shrink fold changes for lowly expressed genes
# use ashr so we can use `contrast` as conversion to coef is not trivial, see:
# https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators
res <- lfcShrink(
  dds,
  contrast = contrast,
  res = res,
  type = "ashr"
)

# sort by p-value
res <- res[order(res$padj), ]
# TODO explore IHW usage


# store results
svg(snakemake@output[["ma_plot"]])
plotMA(res, ylim = c(-2, 2))
dev.off()

write.table(
  data.frame(
    "gene" = rownames(res),
    res
  ),
  file = snakemake@output[["table"]],
  row.names = FALSE,
  sep = "\t"
)
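The contrast to test is looked up under diffexp: contrasts: in config.yaml via the {contrast} wildcard, so one rule instance is produced per configured contrast. A sketch with placeholder output locations:

rule deseq2:
    input:
        "results/deseq2/all.rds",
    output:
        table="results/diffexp/{contrast}.diffexp.tsv",
        ma_plot="results/diffexp/{contrast}.ma-plot.svg",
    log:
        "logs/deseq2/{contrast}.log",
    threads: 4
    script:
        "../scripts/deseq2.R"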
Script scripts/gene2symbol.R:
library(biomaRt)
library(tidyverse)

# this variable holds a mirror name until
# useEnsembl succeeds ("www" is last, because 
# of very frequent "Internal Server Error"s)
mart <- "useast"
rounds <- 0
while ( class(mart)[[1]] != "Mart" ) {
  mart <- tryCatch(
    {
      # done here, because error function does not
      # modify outer scope variables, I tried
      if (mart == "www") rounds <- rounds + 1
      # equivalent to useMart, but you can choose
      # the mirror instead of specifying a host
      biomaRt::useEnsembl(
        biomart = "ENSEMBL_MART_ENSEMBL",
        dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
        mirror = mart
      )
    },
    error = function(e) {
      # change or make configurable if you want more or
      # less rounds of tries of all the mirrors
      if (rounds >= 3) {
        stop("Could not reach any Ensembl mirror after repeated attempts.")
      }
      # hop to next mirror
      mart <- switch(mart,
                     useast = "uswest",
                     uswest = "asia",
                     asia = "www",
                     www = {
                       # wait before starting another round through the mirrors,
                       # hoping that intermittent problems disappear
                       Sys.sleep(30)
                       "useast"
                     }
              )
    }
  )
}


df <- read.table(snakemake@input[["counts"]], sep = '\t', header = TRUE)

g2g <- biomaRt::getBM(
            attributes = c( "ensembl_gene_id",
                            "external_gene_name"),
            filters = "ensembl_gene_id",
            values = df$gene,
            mart = mart,
            )

annotated <- merge(df, g2g, by.x="gene", by.y="ensembl_gene_id")
annotated$gene <- ifelse(annotated$external_gene_name == '', annotated$gene, annotated$external_gene_name)
annotated$external_gene_name <- NULL
write.table(annotated, snakemake@output[["symbol"]], sep='\t', row.names=F)
Script scripts/gtf2bed.py:
import gffutils

db = gffutils.create_db(
    snakemake.input[0],
    dbfn=snakemake.output.db,
    force=True,
    keep_order=True,
    merge_strategy="merge",
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)

with open(snakemake.output.bed, "w") as outfileobj:
    for tx in db.features_of_type("transcript", order_by="start"):
        bed = [s.strip() for s in db.bed12(tx).split("\t")]
        bed[3] = tx.id
        outfileobj.write("{}\n".format("\t".join(bed)))
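The resulting BED12 file feeds the RSeQC commands excerpted above (their -r {input.bed} argument). A hypothetical rule around this script, with placeholder paths (the workflow's rule is at rules/qc.smk, line 14):

rule gtf2bed:
    input:
        "resources/genome.gtf",
    output:
        bed="resources/genome.bed",
        db=temp("resources/genome.gtf.db"),
    script:
        "../scripts/gtf2bed.py"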
Script scripts/plot-pca.R:
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("DESeq2")

# load deseq2 data
dds <- readRDS(snakemake@input[[1]])

# apply a regularized-log (rlog) transformation for PCA
counts <- rlog(dds, blind=FALSE)
svg(snakemake@output[[1]])
plotPCA(counts, intgroup = snakemake@wildcards[["variable"]])
dev.off()

Name: rna-seq-star-deseq2
Version: v2.0.0
URL: https://github.com/snakemake-workflows/rna-seq-star-deseq2
License: MIT License