simple RNAseq pipeline based on snakemake workflow with extensive QC

public public 1yr ago Version: 3 0 bookmarks

This workflow performs a differential gene expression analysis with STAR and Deseq2.

Usage

The usage of this workflow is described in the

Code Snippets

14
15
16
17
18
19
20
21
22
23
shell:
    "mkdir -p {output} && "
    "STAR --runMode genomeGenerate "
    "--runThreadN {threads} "
    "--genomeDir {output} "
    "--genomeFastaFiles {input.fasta} "
    "--sjdbOverhang {params.overhang} "
    "--sjdbGTFfile {params.gtf} "
    "--outStd Log "
    "&> {log}"
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
    shell:
        "STAR --runThreadN {threads} "
        "--genomeDir {input.index} "
        "--readFilesCommand zcat "
        "--readFilesIn {params.fastqs} "
        "--outSAMtype BAM SortedByCoordinate "
        "--quantMode GeneCounts TranscriptomeSAM "
        "--sjdbGTFfile {params.gtf} "
	    "--outSAMunmapped Within "
	    "--outFilterType BySJout "
	    "--outSAMattributes NH HI AS NM MD "
	    "--alignIntronMin 20 "
	    "--alignIntronMax 1000000 "
	    "--alignMatesGapMax 1000000 "
	    "--alignSJoverhangMin 8 "
	    "--alignSJDBoverhangMin 1 " 
        "--outFileNamePrefix {params.outprefix} "
        "--outStd Log && "
        "samtools index {output.bam} && "
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
run:
    data_dict = {}
    cols = ["transcript_id", "gene_id", "length"]
    for data in ['expected_count', 'TPM', 'FPKM']:
        show_output(f"Combining {data} to {output[data]}")
        # load the df of shared rows into the data dict
        data_dict[data] = pd.read_csv(input[0], sep="\t").loc[:, cols]
        # add the 
        data_cols = cols + [data]
        for f in input:
            # retrieve the sample name
            sample_name = f.split("/")[-1].split("-")[0]
            show_output(f"Reading {f}")
            sample_df = pd.read_csv(f, sep="\t").loc[:, data_cols].rename({data:sample_name}, axis=1)
            # print(sample_df.columns)
            data_dict[data] = data_dict[data].merge(sample_df, how="outer")
        data_dict[data].to_csv(output[data], sep="\t", index=False)
        show_output(f"Combined {data} data written to {output[data]}", color="success")
41
42
script:
    "../scripts/count-matrix.py"
14
15
shell:
    "zcat {input.fastq} | fastqc stdin:{params.name} {params.limits_file}-o qc/fastqc/ "  # &>{log} "    
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
    shell:
        "multiqc -f -o qc/ -n fastQC --interactive qc/fastqc/; "  # interactive for big number of files
        "rm -f qc/fastqc/*_fastqc.html qc/fastq/*.sub"  # leave the zip files for accumulated multiQC of all processed samples


## RSEQC
rule rseqc_gtf2db:
    output:
        db=rseqc_gene_model("db"),
    log:
        "logs/rseqc_gtf2db.log",
    params:
        gtf = get_gtf()
    conda:
        "../envs/gffutils.yaml"
41
42
script:
    "../scripts/gtf2db.py"
SnakeMake From line 41 of rules/qc.smk
54
55
script:
    "../scripts/gtfdb2bed.py"
SnakeMake From line 54 of rules/qc.smk
73
74
shell:
    "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
SnakeMake From line 73 of rules/qc.smk
92
93
94
shell:
    "junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log} 2>&1"
SnakeMake From line 92 of rules/qc.smk
107
108
shell:
    "bam_stat.py -i {input} > {output} 2> {log}"
SnakeMake From line 107 of rules/qc.smk
122
123
shell:
    "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
SnakeMake From line 122 of rules/qc.smk
139
140
shell:
    "inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1"
SnakeMake From line 139 of rules/qc.smk
154
155
shell:
    "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
SnakeMake From line 154 of rules/qc.smk
171
172
shell:
    "read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1"
SnakeMake From line 171 of rules/qc.smk
187
188
shell:
    "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"
SnakeMake From line 187 of rules/qc.smk
237
238
wrapper:
    "0.75.0/bio/multiqc"
12
13
14
15
16
17
shell:
    "rsem-calculate-expression "
    "--bam --num-threads {threads} "
    "--paired-end --forward-prob 0.5 "
    "--no-bam-output "
    "{input.tbam} {params.rsem_ref} results/RSEM/{wildcards.sample}-{wildcards.unit}"
41
42
43
shell:
    "cutadapt -j {threads} {params.adapters} --minimum-length 1 -q 20 "
    "-o {output.fastq1} -p {output.fastq2} {input.fastq1} {input.fastq2} > {log}"
59
60
61
shell:
    "cutadapt -j {threads} {params.adapters} --minimum-length 1 -q 20 "
    "-o {output.fastq} {input.fastq1} > {log}"
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import sys

# logging
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def get_matrix(s):

    counts = [
        pd.read_table(
            f, index_col=0, usecols=[0, 1], header=None, skiprows=4
        ) for f in s.input
    ]

    for t, sample in zip(counts, s.params.samples):
        t.columns = [sample]

    matrix = pd.concat(counts, axis=1)
    matrix.index.name = "gene"
    # collapse technical replicates
    matrix = matrix.groupby(matrix.columns, axis=1).sum()
    matrix.to_csv(s.output[0], sep="\t")


get_matrix(snakemake)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
import gffutils

db = gffutils.create_db(
    snakemake.params.gtf,
    dbfn=snakemake.output.db,
    force=True,
    keep_order=True,
    merge_strategy="merge",
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)
1
2
3
4
5
6
7
8
9
import gffutils

db = gffutils.FeatureDB(snakemake.input.db, keep_order=True)

with open(snakemake.output.bed, "w") as outfileobj:
    for tx in db.features_of_type("transcript", order_by="start"):
        bed = [s.strip() for s in db.bed12(tx).split("\t")]
        bed[3] = tx.id
        outfileobj.write("{}\n".format("\t".join(bed)))
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


input_dirs = set(path.dirname(fp) for fp in snakemake.input)
output_dir = path.dirname(snakemake.output[0])
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc"
    " {snakemake.params}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_dirs}"
    " {log}"
)
ShowHide 16 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/Mar111tiN/RNAseq
Name: rnaseq
Version: 3
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...