RNAseq Analysis Workflow from Alignment to DEG Analysis with Snakemake

public 1yr ago Version: 2 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

This workflow performs an RNAseq data analysis from alignement(STAR aligner) to DEG analysis(DESEQ2)

dag1

Install Snakemake and Git

Please make sure that Git and Snakemake are correctly installed

Git: https://anaconda.org/anaconda/git

Snakemake: https://snakemake.readthedocs.io/en/stable/getting_started/installation.html

Clone workflow into your working directory

git clone https://github.com/mhu10/SMK_RNAseq path/to/workdir

Modify filenames of your raw fastq files as needed

The format of your pair-end fastq files must be

prefix_R1.fastq.gz

prefix_R2.fastq.gz

Edit config file and Snakefile file as needed

./SMK_RNAseq/config/'config.yaml

./SMK_RNAseq/Snakefile

Activate snakemake

conda activate snakemake

Dry run workflow

snakemake -n

Execute workflow

snakemake --use-conda --cores 12

Build DAG workflow chart

snakemake --rulegraph | dot -Tsvg > dag1.svg

Code Snippets

samples <- snakemake@input
out_file <- snakemake@output[["outfile"]]
dirpath <- snakemake@params[['outputpath']]


data <- lapply(samples, function(s){
    sname <- strsplit(s, dirpath)[[1]][2]
    sname <- strsplit(sname, "_ReadsPerGene.out.tab")[[1]][1]
    sample_data <- read.table(s, sep="\t", row.names='V1')
    colnames(sample_data) <- rep(sname,3)
    return(sample_data)
})

data <- do.call(cbind, data)
data <- data[,seq(3,length(colnames(data)),3)]

write.table(data, out_file, sep="\t")

R From line 1 of scripts/CombineExp.R

library('DESeq2')
library('ggplot2')
library('dplyr')


SampleName = snakemake@config[['LIST_SAMPLES']]
SampleCondition = snakemake@config[['LIST_SAMPLES_BIO']]
cutoff_1 <- as.numeric(snakemake@config[['Threshold_Padj']])
cutoff_2 <- as.numeric(snakemake@config[['Threshold_logFC']])
out_table <- snakemake@output[["deseq2output"]]
name_ctl = snakemake@config[['CTL_NAME']]
name_stim = snakemake@config[['STIM_NAME']]
datainput <- read.table(snakemake@input[['ExpTable']], sep="\t",check.names=FALSE)

##########################################
datainput = datainput[,SampleName]
rawcounts = datainput[-c(1:4),]
rawcounts = rawcounts[rowSums(rawcounts[])>0,]

######################################

sampleid = SampleName
condition = SampleCondition

coldata = data.frame(condition)
rownames(coldata)=sampleid

dds <- DESeqDataSetFromMatrix(countData = rawcounts,
                              colData = coldata,
                              design = ~ condition)

dds$condition <- relevel(dds$condition, ref = name_ctl)
dds$sampleid <- sampleid

vsd <- vst(dds)


### PCA plot ###

fig_1 <- plotPCA(vsd, intgroup=c("sampleid",'condition'))+theme_classic()+theme(legend.title = element_blank())
ggsave(snakemake@output[["plotPCA"]], plot = fig_1)

#### DEG Analysis
dds <- DESeq(dds)

res_12 <- results(dds,contrast = c('condition',name_stim,name_ctl))
summary(res_12)

res_12_data = data.frame(res_12)
res_12_data = na.omit(res_12_data)

res_12_data$DEG <- 'No'
res_12_data[(res_12_data$padj < cutoff_1) & (res_12_data$log2FoldChange > cutoff_2),'DEG'] <-'Up-regulated'
res_12_data[(res_12_data$padj < cutoff_1) & (res_12_data$log2FoldChange < -cutoff_2),'DEG'] <-'Down-regulated'

res_12_data$DEG <- factor(res_12_data$DEG, levels = c('Up-regulated','Down-regulated','No'))

write.table(res_12_data, out_table, sep="\t")


### Volcano plot ###

fig_2<- ggplot(res_12_data, aes(x=log2FoldChange, y=-log10(padj), col=DEG)) +
  geom_point(size=1,alpha=0.8) + theme_classic()+
  scale_color_manual(values=c('#FF66FF','#0099CC','#CCCCCC'))+
  theme(axis.title = element_text(size=12),
        legend.text = element_text(size=12),legend.title = element_text(size=12))

ggsave(snakemake@output[["plotVolcano"]],device='png',plot = fig_2)

R ggplot2 dplyr DESeq2 PCAtools From line 1 of scripts/DESEQ2.R

wrapper:
    "v1.7.1/bio/fastqc"

SnakeMake FastQC From line 36 of main/Snakefile

wrapper:
    "v1.7.1/bio/trimmomatic/pe"

SnakeMake From line 61 of main/Snakefile

shell:
    "mkdir -p {output.outputdir} && "
    'STAR --runThreadN {threads} '
    '--runMode genomeGenerate '
    '--genomeDir {output.outputdir} '
    '--genomeFastaFiles {input.fa} '
    '--sjdbGTFfile {input.gtf} '
    '--sjdbOverhang 50'

SnakeMake STAR From line 76 of main/Snakefile

shell:
    "STAR "
    "--runThreadN 12 "
    "--sjdbGTFfile {params.gtf} "
    "--genomeDir {params.starindex} "
    "--readFilesIn {input.r1} {input.r2} "
    "--outFileNamePrefix {params.outprefix} "
    "--outSAMattributes Standard "
    "--outSAMtype BAM SortedByCoordinate "
    "--outReadsUnmapped Fastx "
    "--quantMode TranscriptomeSAM GeneCounts "

SnakeMake STAR From line 101 of main/Snakefile

script:
    "scripts/CombineExp.R"

SnakeMake From line 123 of main/Snakefile

script:
    "scripts/DESEQ2.R"

SnakeMake From line 138 of main/Snakefile

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "julianderuiter@gmail.com"
__license__ = "MIT"


from os import path
import re
from tempfile import TemporaryDirectory

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)


def basename_without_ext(file_path):
    """Returns basename of file path, without the file extension."""

    base = path.basename(file_path)
    # Remove file extension(s) (similar to the internal fastqc approach)
    base = re.sub("\\.gz$", "", base)
    base = re.sub("\\.bz2$", "", base)
    base = re.sub("\\.txt$", "", base)
    base = re.sub("\\.fastq$", "", base)
    base = re.sub("\\.fq$", "", base)
    base = re.sub("\\.sam$", "", base)
    base = re.sub("\\.bam$", "", base)

    return base


# Run fastqc, since there can be race conditions if multiple jobs
# use the same fastqc dir, we create a temp dir.
with TemporaryDirectory() as tempdir:
    shell(
        "fastqc {snakemake.params} -t {snakemake.threads} "
        "--outdir {tempdir:q} {snakemake.input[0]:q}"
        " {log}"
    )

    # Move outputs into proper position.
    output_base = basename_without_ext(snakemake.input[0])
    html_path = path.join(tempdir, output_base + "_fastqc.html")
    zip_path = path.join(tempdir, output_base + "_fastqc.zip")

    if snakemake.output.html != html_path:
        shell("mv {html_path:q} {snakemake.output.html:q}")

    if snakemake.output.zip != zip_path:
        shell("mv {zip_path:q} {snakemake.output.zip:q}")

Python Snakemake FastQC From line 3 of fastqc/wrapper.py

__author__ = "Johannes Köster, Jorge Langa"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "koester@jimmy.harvard.edu"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

# Distribute available threads between trimmomatic itself and any potential pigz instances
def distribute_threads(input_files, output_files, available_threads):
    gzipped_input_files = sum(1 for file in input_files if file.endswith(".gz"))
    gzipped_output_files = sum(1 for file in output_files if file.endswith(".gz"))
    potential_threads_per_process = available_threads // (
        1 + gzipped_input_files + gzipped_output_files
    )
    if potential_threads_per_process > 0:
        # decompressing pigz creates at most 4 threads
        pigz_input_threads = (
            min(4, potential_threads_per_process) if gzipped_input_files != 0 else 0
        )
        pigz_output_threads = (
            (available_threads - pigz_input_threads * gzipped_input_files)
            // (1 + gzipped_output_files)
            if gzipped_output_files != 0
            else 0
        )
        trimmomatic_threads = (
            available_threads
            - pigz_input_threads * gzipped_input_files
            - pigz_output_threads * gzipped_output_files
        )
    else:
        # not enough threads for pigz
        pigz_input_threads = 0
        pigz_output_threads = 0
        trimmomatic_threads = available_threads
    return trimmomatic_threads, pigz_input_threads, pigz_output_threads


def compose_input_gz(filename, threads):
    if filename.endswith(".gz") and threads > 0:
        return "<(pigz -p {threads} --decompress --stdout {filename})".format(
            threads=threads, filename=filename
        )
    return filename


def compose_output_gz(filename, threads, compression_level):
    if filename.endswith(".gz") and threads > 0:
        return ">(pigz -p {threads} {compression_level} > {filename})".format(
            threads=threads, compression_level=compression_level, filename=filename
        )
    return filename


extra = snakemake.params.get("extra", "")
java_opts = get_java_opts(snakemake)
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
compression_level = snakemake.params.get("compression_level", "-5")
trimmer = " ".join(snakemake.params.trimmer)

# Distribute threads
input_files = [snakemake.input.r1, snakemake.input.r2]
output_files = [
    snakemake.output.r1,
    snakemake.output.r1_unpaired,
    snakemake.output.r2,
    snakemake.output.r2_unpaired,
]

trimmomatic_threads, input_threads, output_threads = distribute_threads(
    input_files, output_files, snakemake.threads
)

input_r1, input_r2 = [
    compose_input_gz(filename, input_threads) for filename in input_files
]

output_r1, output_r1_unp, output_r2, output_r2_unp = [
    compose_output_gz(filename, output_threads, compression_level)
    for filename in output_files
]

shell(
    "trimmomatic PE -threads {trimmomatic_threads} {java_opts} {extra} "
    "{input_r1} {input_r2} "
    "{output_r1} {output_r1_unp} "
    "{output_r2} {output_r2_unp} "
    "{trimmer} "
    "{log}"
)