Snakemake workflow for performing an RNA-seq analysis

public public 8mo ago 0 bookmarks

Snakemake workflow integrated with copier to initialize and performing an RNA-seq analysis.

Code Snippets

11
12
script:
    "../scripts/count-matrix.py"
31
32
script:
    "../scripts/map_gene_ids.py"
48
49
script:
    "../scripts/deseq2-init.R"
71
72
script:
    "../scripts/plot-pca.R"
89
90
script:
    "../scripts/deseq2.R"
113
114
115
116
117
118
119
120
shell:
    """
    (head -n1 {input}; \
    tail -n+2 {input} | awk 'BEGIN {{OFS="\\t"}} {{if ($7<=0.1 && $3>0) {{ \
            print $1,$2,$3,$4,$5,$6,$7}} \
        }}' | sort -k5,5nr -k3,3n) \
    > {output}
    """
128
129
130
131
132
133
134
135
shell:
    """
    (head -n1 {input}; \
    tail -n+2 {input} | awk 'BEGIN {{OFS="\\t"}} {{if ($7<=0.1 && $3<0) {{ \
            print $1,$2,$3,$4,$5,$6,$7}} \
        }}' | sort -k5,5nr -k3,3n) \
    > {output}
    """
22
23
shell:
    "cp -av --dereference {input} {output}"
10
11
script:
    "../scripts/gene_id_map.py"
 9
10
11
12
shell:
    "cp -v {input.fq1} {output.fq1}; "
    "cp -v {input.fq2} {output.fq2}; "
    "chmod 0644 {output.fq1} {output.fq2}"
36
37
wrapper:
    "v1.5.0/bio/star/align"
SnakeMake From line 36 of rules/map.smk
65
66
wrapper:
    "v1.5.0/bio/samtools/index"
SnakeMake From line 65 of rules/map.smk
84
shell: "bamCoverage --normalizeUsing RPKM --verbose -b {input[0]} -o {output[0]} -p {threads} &> {log}"
29
30
31
shell:
    "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log} 2>&1"
SnakeMake From line 29 of rules/qc.smk
48
49
50
shell:
    "junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log} 2>&1"
SnakeMake From line 48 of rules/qc.smk
63
64
shell:
    "bam_stat.py -i {input} > {output} 2> {log}"
SnakeMake From line 63 of rules/qc.smk
78
79
shell:
    "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
SnakeMake From line 78 of rules/qc.smk
95
96
shell:
    "inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1"
SnakeMake From line 95 of rules/qc.smk
110
111
shell:
    "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
SnakeMake From line 110 of rules/qc.smk
126
127
shell:
    "read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1"
SnakeMake From line 126 of rules/qc.smk
142
143
shell:
    "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"
SnakeMake From line 142 of rules/qc.smk
168
169
wrapper:
    "v1.7.0/bio/multiqc"
24
25
wrapper:
    "v1.5.0/bio/cutadapt/pe"
SnakeMake From line 24 of rules/trim.smk
41
42
wrapper:
    "v1.5.0/bio/cutadapt/se"
SnakeMake From line 41 of rules/trim.smk
11
12
shell:
    "cut -f 1 {input} | sed 's/\"//g' | tail -n+2 > {output}"
22
23
script:
    "../scripts/get_diffexp_cds.py"
39
40
shell:
    "zcat {input.vcf} | bedtools intersect -a stdin -b {input.target_region_bed} -header -wa -u > {output}"
54
script: "../scripts/get_pathogenic_variants.py"
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import pandas as pd

def get_column(strandedness):
    if pd.isnull(strandedness) or strandedness == "none":
        return 1 #non stranded protocol
    elif strandedness == "yes":
        return 2 #3rd column
    elif strandedness == "reverse":
        return 3 #4th column, usually for Illumina truseq
    else:
        raise ValueError(("'strandedness' column should be empty or have the "
                          "value 'none', 'yes' or 'reverse', instead has the "
                          "value {}").format(repr(strandedness)))

counts = [pd.read_table(f, index_col=0, usecols=[0, get_column(strandedness)],
          header=None, skiprows=4)
          for f, strandedness in zip(snakemake.input, snakemake.params.strand)]

for t, sample in zip(counts, snakemake.params.samples):
    t.columns = [sample]

matrix = pd.concat(counts, axis=1)
matrix.index.name = "gene"
# collapse technical replicates
matrix = matrix.groupby(matrix.columns, axis=1, sort=False).sum()
matrix.to_csv(snakemake.output[0], sep="\t")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("DESeq2")

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

# colData and countData must have the same sample order, but this is ensured
# by the way we create the count matrix
cts <- read.table(snakemake@input[["counts"]], header=TRUE, row.names="gene", check.names=FALSE)
# cts <- cts[ , order(names(cts))]
coldata <- read.table(snakemake@params[["samples"]], header=TRUE, row.names="sample", check.names=FALSE)
# coldata <- coldata[order(row.names(coldata)), , drop=F]

# dds <- DESeqDataSetFromMatrix(countData=cts,
dds <- DESeqDataSetFromMatrix(countData=cts[,rownames(coldata)],
                              colData=coldata,
                              design=as.formula(snakemake@params[["design"]]))

# remove uninformative columns
dds <- dds[ rowSums(counts(dds)) > 1, ]
# normalization and preprocessing
dds <- DESeq(dds, parallel=parallel)

saveRDS(dds, file=snakemake@output[[1]])

# Write normalized counts
norm_counts = counts(dds, normalized=T)
write.table(data.frame("gene"=rownames(norm_counts), norm_counts), file=snakemake@output[[2]], sep='\t', row.names=F)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("DESeq2")

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

dds <- readRDS(snakemake@input[[1]])

contrast <- c("condition", snakemake@params[["contrast"]])
res <- results(dds, contrast=contrast, parallel=parallel)
# shrink fold changes for lowly expressed genes
# use ashr so we can use `contrast` as conversion to coef is not trivial
# see https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators
res <- lfcShrink(dds, contrast=contrast, res=res, type="ashr")

# sort by p-value
res <- res[order(res$padj),]
# TODO explore IHW usage


# store results
svg(snakemake@output[["ma_plot"]])
plotMA(res, ylim=c(-2,2))
dev.off()

write.table(data.frame("gene"=rownames(res),res), file=snakemake@output[["table"]], row.names=FALSE, sep='\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
import gffutils

db = gffutils.FeatureDB(snakemake.input.db)

id_name_map = dict()

with open(snakemake.output.tsv, 'w') as outfileobj:
    outfileobj.write('{}\n'.format('\t'.join(['gene_id','gene_name'])))
    for gene in db.features_of_type('gene'):
        if "gene_id" in gene.attributes: 
            if "gene_name" in gene.attributes:
                id_name_map[gene.attributes["gene_id"][0]] = gene.attributes["gene_name"][0]
            else:
                id_name_map[gene.attributes["gene_id"][0]] = gene.attributes["gene_id"][0] 
    for gene_id,gene_name in id_name_map.items():
        outfileobj.write('{}\n'.format('\t'.join([gene_id,gene_name])))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from collections import defaultdict
import csv 

def get_genes_from_exon_bed(bed_filename, fieldnames=None):
    # Each gene has a list containing the exon rows from the BED file
    genes = defaultdict(lambda : [])
    with open(bed_filename, 'r') as bed_file:
        if fieldnames:
            reader = csv.DictReader(bed_file, delimiter='\t', fieldnames=fieldnames)
        else:
            reader = csv.DictReader(bed_file, delimiter='\t')

        for row in reader:
            gene_hgnc = row.get('gene_hgnc', None)
            if gene_hgnc:
                # genes[gene_hgnc].append(row)
                genes[row['ensembl']].append(row)

    return genes


def get_gene_list(filename):
    with open(filename, 'r') as gene_list_file:
        reader = csv.reader(gene_list_file)
        gene_list = [row[0] for row in reader if len(row) > 0]

    return gene_list


fieldnames=["chr","start","end","rank","ensembl","refseq","gene_hgnc"]


def write_gene_exons(bed_filename, gene_exons):
    with open(bed_filename, 'w') as bed_file:
        writer = csv.DictWriter(bed_file, delimiter='\t', fieldnames=fieldnames)

        for exon in gene_exons:
            writer.writerow(exon)


# Get the exons of every gene as a list in a dict where the key is the refseq accession number
all_genes = get_genes_from_exon_bed(snakemake.input.cds_regions)

# Pass in the field names here because the input file was created by bedtools which removes the header
target_genes = get_gene_list(snakemake.input.diffexp_genes)

gene_exons_to_keep = []
# gene_exons_to_discard = []

# print(all_genes)
# print(target_genes)

for gene_name in target_genes:
    # print(gene_name)
    if gene_name in all_genes.keys():
        gene_exons_to_keep.extend(all_genes[gene_name])

write_gene_exons(snakemake.output.bed, gene_exons_to_keep)
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from collections import defaultdict
import csv 
import vcf

# Initialize a defaultdict for genes to hold MC with genes
genes = defaultdict(lambda : defaultdict(lambda : 0))

# This will hold all possible molecular consequence values
# to be used for printing the final table
possible_mcs = []

def process_vcf(vcf_filename):
    with open(vcf_filename, 'r') as vcf_file:
        vcf_reader = vcf.Reader(vcf_file)
        for record in vcf_reader:
            clnsig = record.INFO.get("CLNSIG", [])

            # Only process record if it is pathogenic
            if "Pathogenic" in clnsig or "Pathogenic/Likely_pathogenic" in clnsig or "Likely_pathogenic" in clnsig:

                molecular_consequences = record.INFO.get("MC", [])
                molecular_consequences = [x.split("|")[1] for x in molecular_consequences]
                possible_mcs.extend(molecular_consequences)

                # Only process records with a gene attached
                gene_info = record.INFO.get("GENEINFO", None)
                if gene_info:
                    gene = gene_info.split(":")[0]
                    for mc in molecular_consequences:
                        genes[gene][mc] += 1


def write_tsv(mcs, genes_dict, tsv_filename):
    # The header of the tsv file
    header = ["gene_name"]
    header.extend(mcs)

    with open(tsv_filename, 'w') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        writer.writerow(header)
        for gene, mc_dict in genes_dict.items():
            gene_line = [gene]
            for mc in mcs:
                mc_count = mc_dict.get(mc, 0)
                gene_line.append(mc_count)

            writer.writerow(gene_line)



process_vcf(snakemake.input[0])

# Create a unique set of possible molecular consequences

possible_mcs = list(set(possible_mcs))
possible_mcs.sort()

write_tsv(possible_mcs, genes, snakemake.output[0])
1
2
3
4
5
6
7
8
import pandas as pd

counts = pd.read_table(snakemake.input.tsv, index_col=0)
gene_id_map = pd.read_table(snakemake.input.gene_id_map, index_col=0)

result = pd.concat([gene_id_map,counts],axis=1)

result.to_csv(snakemake.output.tsv, sep="\t")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("DESeq2")

# load deseq2 data
dds <- readRDS(snakemake@input[[1]])

# obtain normalized counts
counts <- rlog(dds, blind=FALSE)
svg(snakemake@output[[1]])
plotPCA(counts, intgroup=snakemake@params[["pca_labels"]])
dev.off()
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 2, "Input must contain 2 (paired-end) elements."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq1}"
    " -p {snakemake.output.fastq2}"
    " -j {snakemake.threads}"
    " {snakemake.input}"
    " > {snakemake.output.qc} {log}"
)
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 1, "Input must contain 1 (single-end) element."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " {adapters}"
    " {extra}"
    " -j {snakemake.threads}"
    " -o {snakemake.output.fastq}"
    " {snakemake.input[0]}"
    " > {snakemake.output.qc} {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@
# One thread for samtools merge
# Other threads are *additional* threads passed to the '-@' argument
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
import tempfile
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

fq1 = snakemake.input.get("fq1")
assert fq1 is not None, "input-> fq1 is a required input parameter"
fq1 = (
    [snakemake.input.fq1]
    if isinstance(snakemake.input.fq1, str)
    else snakemake.input.fq1
)
fq2 = snakemake.input.get("fq2")
if fq2:
    fq2 = (
        [snakemake.input.fq2]
        if isinstance(snakemake.input.fq2, str)
        else snakemake.input.fq2
    )
    assert len(fq1) == len(
        fq2
    ), "input-> equal number of files required for fq1 and fq2"
input_str_fq1 = ",".join(fq1)
input_str_fq2 = ",".join(fq2) if fq2 is not None else ""
input_str = " ".join([input_str_fq1, input_str_fq2])

if fq1[0].endswith(".gz"):
    readcmd = "--readFilesCommand zcat"
else:
    readcmd = ""

index = snakemake.input.get("idx")
if not index:
    index = snakemake.params.get("idx", "")

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "STAR "
        " --runThreadN {snakemake.threads}"
        " --genomeDir {index}"
        " --readFilesIn {input_str}"
        " {readcmd}"
        " {extra}"
        " --outTmpDir {tmpdir}/temp"
        " --outFileNamePrefix {tmpdir}/"
        " --outStd Log "
        " {log}"
    )

    if "SortedByCoordinate" in extra:
        bamprefix = "Aligned.sortedByCoord.out"
    else:
        bamprefix = "Aligned.out"

    if snakemake.output.get("bam"):
        shell("cat {tmpdir}/{bamprefix}.bam > {snakemake.output.bam:q}")
    if snakemake.output.get("sam"):
        shell("cat {tmpdir}/{bamprefix}.sam > {snakemake.output.sam:q}")
    if snakemake.output.get("reads_per_gene"):
        shell("cat {tmpdir}/ReadsPerGene.out.tab > {snakemake.output.reads_per_gene:q}")
    if snakemake.output.get("chim_junc"):
        shell("cat {tmpdir}/Chimeric.out.junction > {snakemake.output.chim_junc:q}")
    if snakemake.output.get("sj"):
        shell("cat {tmpdir}/SJ.out.tab > {snakemake.output.sj:q}")
    if snakemake.output.get("log"):
        shell("cat {tmpdir}/Log.out > {snakemake.output.log:q}")
    if snakemake.output.get("log_progress"):
        shell("cat {tmpdir}/Log.progress.out > {snakemake.output.log_progress:q}")
    if snakemake.output.get("log_final"):
        shell("cat {tmpdir}/Log.final.out > {snakemake.output.log_final:q}")
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
# Set this to False if multiqc should use the actual input directly
# instead of parsing the folders where the provided files are located
use_input_files_only = snakemake.params.get("use_input_files_only", False)

if not use_input_files_only:
    input_data = set(path.dirname(fp) for fp in snakemake.input)
else:
    input_data = set(snakemake.input)

output_dir = path.dirname(snakemake.output[0])
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc"
    " {extra}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_data}"
    " {log}"
)
ShowHide 29 more snippets with no or duplicated tags.

Free

Created: 8mo ago
Updated: 8mo ago
Maitainers: public
URL: https://github.com/cbp44/snakemake-workflow-rna-seq
Name: snakemake-workflow-rna-seq
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...