A snakemake pipeline to analyze RNA-seq expression data



Pre-installation of the HTSlib library (required by MAJIQ and VOILA)

wget 
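
Once HTSlib is built and installed, MAJIQ's installer locates it through environment variables. A minimal sketch, assuming an install prefix of $HOME/htslib (hypothetical path; the variable names follow MAJIQ's installation notes):

export HTSLIB_LIBRARY_DIR=$HOME/htslib/lib
export HTSLIB_INCLUDE_DIR=$HOME/htslib/include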

Code Snippets

STAR: build the genome index
shell:
    "STAR --runThreadN {threads} "
    "--runMode genomeGenerate "
    "--genomeDir {output} "
    "--genomeFastaFiles {input.fasta} "
    "--sjdbGTFfile {input.gtf} "
    "--sjdbOverhang 99 "
    "&> {log}"

STAR: align reads
shell:
    "STAR --runMode alignReads "
    "--genomeDir {input.idx} "
    "--readFilesIn {input.fq1} {input.fq2} "
    "--runThreadN {threads} "
    "--readFilesCommand zcat --outReadsUnmapped Fastx "
    "--outFilterType BySJout --outStd Log --outSAMunmapped None "
    "--outSAMtype BAM SortedByCoordinate "
    "--outFileNamePrefix {params.out_prefix} "
    "{params.extra} "
    "&> {log}"

samtools view: keep primary alignments only (-F 256 filters out secondary alignments)
shell:
    "samtools view -b -F 256 -o {output} {input} "
    "&> {log}"

samtools index: index the BAM file
shell:
    "samtools index {input} "
    "&> {log}"  

bedtools genomecov: coverage as a position-sorted bedGraph
shell:
    "bedtools genomecov -bg -split -ibam {input} | sort -k1,1 -k2,2n > {output}"

DESeq2 differential expression (script)
script:
    "../scripts/DESeq2_kallisto_tximport.R"

Volcano plot (script)
script:
    "../scripts/volcano_plot.py"

GO enrichment bar charts (script)
script:
    "../scripts/GO_bar_charts.py"

GO enrichment bar charts (script)
script:
    "../scripts/GO_bar_charts.py"

FastQC quality control
shell:
    "fastqc "
    "--outdir {params} "
    "--format fastq "
    "{input} "
    "&> {log}"

Rename FastQC outputs to the expected paths
shell:
    "touch {output.html} && touch {output.zip} && "
    "mv {input.html} {output.html} && "
    "mv {input.zip} {output.zip}"

FastQC quality control
shell:
    "fastqc "
    "--outdir {params} "
    "--format fastq "
    "{input} "
    "&> {log}"

FastQC quality control
shell:
    "fastqc "
    "--outdir {params} "
    "--format fastq "
    "{input} "
    "&> {log}"

Picard CollectInsertSizeMetrics: insert size metrics
shell:
    "picard CollectInsertSizeMetrics "
    "--INPUT {input} "
    "--OUTPUT {output.txt} "
    "--Histogram_FILE {output.pdf} "
    "{params} "
    "&> {log}"

gffread: extract transcript sequences from the genome
shell:
    "gffread {input.gtf} -g {input.genome} -w {output}"

kallisto index: build the transcriptome index
shell:
    "kallisto index "
    "--index={output} "
    "--kmer-size={params} "
    "{input} "
    "&> {log}"

kallisto quant: transcript quantification
shell:
    "kallisto quant "
    "--bias "
    "--index={input.idx} "
    "--output-dir={params.outdir} "
    "--bootstrap-samples={params.bootstrap} "
    "--threads={threads} "
    "{input.fq1} {input.fq2} "
    "&> {log}"

Transcript-to-gene mapping (script)
script:
    "../scripts/tx2gene.py"

grep: keep protein-coding entries
shell:
    "grep \'protein_coding\' {input} > {output}"

Merge kallisto quantifications into a TPM matrix (script)
script:
    "../scripts/merge_kallisto_quant.py"

PCA of samples (script)
script:
    "../scripts/pca.py"

MultiQC: aggregate QC reports
shell:
    "multiqc {params.scan_dir} "
    "--outdir {params.outdir} "
    "&> {log}"

MAJIQ build
shell:
    "source {params.majiq} && "
    "majiq build {input.gff3} -c {params.config} -j 8 -o {params.outdir} "
    "&> {log} && deactivate"

MAJIQ deltapsi
shell:
    "source {params.majiq_tool} && "
    "majiq deltapsi -grp1 {params.grp1_majiq_files} -grp2 {params.grp2_majiq_files} "
    "-j 8 -o {params.outdir} --name {params.group} "
    "&> {log} && deactivate"

VOILA modulize
shell:
    "source {params.majiq_tool} && "
    "voila modulize {input.splicegraph} {input.voila} "
    "-d {params.outdir} -j 8 --overwrite --decomplexify-deltapsi-threshold 0.05 "
    "&> {log} && deactivate"

VOILA figures (script)
script:
    "../scripts/voila_figures.py"

rMATS splicing analysis (--b1 and --b2 are text files listing each group's BAM paths)
shell:
    "rmats.py --b1 {input.group1} --b2 {input.group2} "
    "--gtf {input.gtf} -t paired --readLength {params.readlength} "
    "--nthread 4 --od {output.outdir} --tmp {output.tmpdir}"

Filter rMATS output (script)
script:
    "../scripts/filter_rmats.py"

Trimmomatic: paired-end read trimming
shell:
    "trimmomatic PE "
    "-threads {threads} "
    "{params.extra} "
    "{input.r1} {input.r2} "
    "{output.r1} {output.unpaired_r1} "
    "{output.r2} {output.unpaired_r2} "
    "{params.trimmer} "
    "&> {log}"

scripts/DESeq2_kallisto_tximport.R:
library(readr)
library(tximport)
library(DESeq2)

# Variables passed in by Snakemake
kallisto_dir <- snakemake@params[["kallisto_dir"]]
output_dir <- snakemake@output[["results"]]
design_file <- snakemake@input[["samples"]]
comparison_file <- snakemake@input[["comparisons"]]
transcript_gene_file <- snakemake@input[["gene_id"]]

# create the directory that will contain the results
dir.create(output_dir, showWarnings=FALSE)

#############################################################
#------------- Importing data and information ---------------
#############################################################
# Loading samples information from the design file.
sampleTable <- read.table(
    design_file, header=TRUE,
    row.names="sample", check.names=FALSE
)
conditions <- unique(sampleTable$condition)
samples <- rownames(sampleTable)
sampleTable$condition <- factor(sampleTable$condition, levels=conditions)

# Loading the comparisons
comparisons <- read.table(
    comparison_file, header=TRUE
)

# Read the transcript-gene matrix
tx2gene <- read_tsv(transcript_gene_file, col_names=c('TXNAME', 'GENEID'))

# Get the h5 files for all conditions
files <- file.path(kallisto_dir, samples, "abundance.h5")
names(files) <- samples
txi.kallisto <- tximport(files, type="kallisto", tx2gene=tx2gene) # for gene differential analysis
# txi.kallisto <- tximport(files, type="kallisto", txOut=TRUE) # for transcript differential analysis


#############################################################
#--------------- Creating the DESeq2 object -----------------
#############################################################
# Create the DESeq2 object with all the conditions
dds <- DESeqDataSetFromTximport(
    txi.kallisto,
    sampleTable,
    ~condition
)

# pre-filter low-count genes
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# set the factor levels for the condition column
dds$condition <- factor(dds$condition, conditions)

# call the function for differential gene expression analysis
dds <- DESeq(dds)


#############################################################
#-------------- Looping over all comparisons ----------------
#############################################################
for (row in 1:nrow(comparisons)) {

    condition1 <- comparisons[row, "cdn1"]
    condition2  <- comparisons[row, "cdn2"]
    exp <- sprintf("%s-%s", condition1, condition2)

    res <- results(
        dds,
        contrast = c(
            "condition",
            condition2,
            condition1
        )
        #pAdjustMethod = "fdr"
    )

    # transform the result to data frame
    res_df <- as.data.frame(res)

    # Writing results to file
    fname <- paste(
        output_dir,
        paste(exp, "csv", sep='.'),
        sep='/'
    )

    write.csv(
        res_df,
        file=fname,
        quote=FALSE
    )
}
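
The script reads two small tab-separated metadata files. A minimal sketch of their layout, inferred from the read.table calls above (sample and condition names are hypothetical; scripts/pca.py additionally expects an ASO column in the design file):

# design file: one row per sample
sample    condition
NC5_1     NC5
NC5_2     NC5
22_1      22
22_2      22

# comparisons file: one contrast per row (cdn1 vs cdn2)
cdn1    cdn2
NC5     22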

scripts/filter_rmats.py:
import pandas as pd
from pybedtools import BedTool
import os


# Access snakemake inputs, outputs and params
raw_dir = snakemake.params.indir # splicing event dir
out_dir = snakemake.params.outdir
tpm = snakemake.params.tpm
sno_file = snakemake.input.sno
eftud2_file = snakemake.input.eftud2
prpf8_file = snakemake.input.prpf8

SPLICING_EVENTS = ['SE.MATS.JC.txt','A5SS.MATS.JC.txt','A3SS.MATS.JC.txt','MXE.MATS.JC.txt']


def filter_by_threshold(df):
    # Filter rMATS output by FDR and IncLevelDifference
    filtered_df = df[df['FDR']<0.05]
    filtered_df = filtered_df[filtered_df['IncLevelDifference'].abs()>0.20]
    return filtered_df

def filter_by_tpm(splicing_df,tpm_df):
    # Keep genes that are expressed in at least one condition (NC5 & SNORD22 KD in kallisto TPM matrix)
    expressed_df = pd.DataFrame(columns=splicing_df.columns)
    # List of unique spliced genes
    uniq = set(splicing_df['GeneID'].values.tolist())
    for gene in uniq:
        gene_tpm_df = tpm_df[tpm_df['gene']==gene]
        if len(gene_tpm_df)>0: # gene present in the TPM-filtered matrix (TPM >= 1 in at least one sample)
            gene_splicing_df = splicing_df[splicing_df['GeneID']==gene]
            expressed_df = pd.concat([expressed_df,gene_splicing_df],ignore_index=True)
    return expressed_df

def parse_splicing(df,type):
    # Transform splicing df to bed
    df['score']=3 # arbitrary placeholder score for BED format
    if type == 'SE' or type == 'MXE':
        df = df[['chr','upstreamES','downstreamEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
        df.rename(columns={"upstreamES": "start", "downstreamEE": "end"},inplace=True)
    else: # A3SS or A5SS
        pos_df = df[df['strand']=='+']
        neg_df = df[df['strand']=='-']
        if type == 'A3SS':
            pos_df = pos_df[['chr','flankingES','shortEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            pos_df.rename(columns={"flankingES": "start", "shortEE": "end"},inplace=True)
            neg_df = neg_df[['chr','longExonStart_0base','flankingEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            neg_df.rename(columns={"longExonStart_0base": "start", "flankingEE": "end"},inplace=True)
        else: # A5SS
            pos_df = pos_df[['chr','longExonStart_0base','flankingEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            pos_df.rename(columns={"longExonStart_0base": "start", "flankingEE": "end"},inplace=True)
            neg_df = neg_df[['chr','flankingES','shortEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            neg_df.rename(columns={"flankingES": "start", "shortEE": "end"},inplace=True)
        df = pd.concat([pos_df,neg_df],ignore_index=True)
    return df

def binding_ovlp(df1,df2):
    # bedtools intersect between binding interactions
    bed1 = BedTool.from_dataframe(df1)
    bed2 = BedTool.from_dataframe(df2)
    intersection_df = bed1.intersect(bed2,s=True).to_dataframe()
    return intersection_df

def spliced_and_bound(splicing_df,event_type,binding_df):
    # bedtools intersect splicing events with binding interactions
    if 'start' not in splicing_df.columns:
        splicing_df = parse_splicing(splicing_df,event_type)
    splicing_bed = BedTool.from_dataframe(splicing_df)
    binding_bed = BedTool.from_dataframe(binding_df)
    intersection_df = splicing_bed.intersect(binding_bed, s=True,u=True).to_dataframe()
    if len(intersection_df)>0:
        intersection_df.columns = splicing_df.columns
        intersection_df = intersection_df.sort_values(by=['IncLevelDifference','FDR'],key=abs,ascending=[False,True]).drop_duplicates(subset=['start','end'],keep='last')
    return intersection_df

def get_full_coordinates(filtered_df,org_df):
    # Get full information on filtered splicing events
    final_df = pd.DataFrame(columns=org_df.columns)
    for i in range(len(filtered_df)):
        gene = filtered_df.iloc[i]['geneSymbol']
        fdr = filtered_df.iloc[i]['FDR']
        lvldiff = filtered_df.iloc[i]['IncLevelDifference']
        row = org_df[(org_df['geneSymbol']==gene)&(org_df['FDR']==fdr)&(org_df['IncLevelDifference']==lvldiff)]
        final_df = pd.concat([final_df,row],ignore_index=True)
    final_df.drop('score', axis=1,inplace=True)
    return final_df

# Kallisto TPM matrix
tpm_df = pd.read_csv(tpm,sep='\t')
# snoRNA binding interactions
sno_df = pd.read_csv(sno_file,sep='\t')
# RBP binding interactions
eftud2_df = pd.read_csv(eftud2_file,sep='\t')
prpf8_df = pd.read_csv(prpf8_file,sep='\t')

# Overlap of binding interactions
sno_eftud2_df = binding_ovlp(sno_df,eftud2_df)
sno_prpf8_df = binding_ovlp(sno_df,prpf8_df)
sno_eftud2_prpf8_df = binding_ovlp(sno_eftud2_df,prpf8_df)

# For each splicing event type, FILTER and find events bound by snoRNA and RBPs
for event in SPLICING_EVENTS:
    print(event)
    file = os.path.join(raw_dir, event)
    event_type = event.split(".")[0]
    df = pd.read_csv(file, sep='\t')
    filtered = filter_by_threshold(df)
    filtered = filter_by_tpm(filtered,tpm_df)

    # Bound by snoRNA
    print('Bound by SNORD22')
    sno_bound = spliced_and_bound(filtered,event_type,sno_df)
    get_full_coordinates(sno_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22.tsv'),sep='\t',index=None)
    # Bound by snoRNA and EFTUD2
    print('Bound by SNORD22 and EFTUD2')
    sno_eftud2_bound = spliced_and_bound(filtered,event_type,sno_eftud2_df)
    get_full_coordinates(sno_eftud2_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22_EFTUD2.tsv'),sep='\t',index=None)
    # Bound by snoRNA and PRPF8
    print('Bound by SNORD22 and PRPF8')
    sno_prpf8_bound = spliced_and_bound(filtered,event_type,sno_prpf8_df)
    get_full_coordinates(sno_prpf8_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22_PRPF8.tsv'),sep='\t',index=None)
    # Bound by snoRNA, EFTUD2 and PRPF8
    print('Bound by SNORD22, EFTUD2 and PRPF8')
    sno_eftud2_prpf8_bound = spliced_and_bound(filtered,event_type,sno_eftud2_prpf8_df)
    get_full_coordinates(sno_eftud2_prpf8_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22_EFTUD2_PRPF8.tsv'),sep='\t',index=None)
    print('Done')

scripts/GO_bar_charts.py:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gprofiler import GProfiler

# Get list of genes
genes = list(pd.read_csv(snakemake.input.genes, sep='\t')['gene'])

# Run gprofiler for enrichment analysis of functional (GO and other) terms
### NEED INTERNET CONNECTION!
gp = GProfiler(return_dataframe=True)
go_results = gp.profile(organism='hsapiens', query=genes)

# Log-transform p-values
go_results['negative_log10_of_p_value'] = np.negative(np.log10(go_results['p_value']))

# Extract GO terms of interest
go_mf = go_results[go_results['source']=='GO:MF'].sort_values(by=['negative_log10_of_p_value'])
go_cc = go_results[go_results['source']=='GO:CC'].sort_values(by=['negative_log10_of_p_value'])
go_bp = go_results[go_results['source']=='GO:BP'].sort_values(by=['negative_log10_of_p_value'])

go_terms = {"GO:BP":go_bp,"GO:MF":go_mf,"GO:CC":go_cc}
colours = {"GO:BP":"indianred","GO:MF":"limegreen","GO:CC":"steelblue"}

go_concat = pd.DataFrame()
colours_present = {}

# Only keep GO sources that returned at least one enriched term
for term in go_terms.keys():
    if len(go_terms[term]) > 0:
        go_concat = pd.concat([go_concat,go_terms[term]], ignore_index=True)
        colours_present.update({term:colours[term]})

# Represent GO results with a bar chart
plt.figure(figsize=(9,4))

bars = pd.DataFrame({'source':go_concat.source.values.tolist(),'p_val':go_concat.negative_log10_of_p_value.values.tolist()},
    index=go_concat.name.values.tolist())

bars['p_val'].plot(kind="barh",color=bars['source'].replace(colours_present),width=0.7)
plt.title('GO Analysis of genes')
plt.xlabel('$-log_{10}$(p-value)')

labels = list(colours_present.keys())
handles = [plt.Rectangle((0,0),1,1, color=colours_present[label]) for label in labels]
plt.legend(handles,labels,loc='lower right')

plt.tight_layout()
plt.savefig(snakemake.output.bar_chart)

scripts/merge_kallisto_quant.py:
import pandas as pd
from gtfparse import read_gtf
import os

tx2gene = snakemake.input.tx2gene
gtf = snakemake.input.gtf
outfile = snakemake.output.tpm

final_df = pd.DataFrame()
cycle = 1
sample_list = []

for q in snakemake.input.quant:
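    # each kallisto abundance.tsv has the columns: target_id, length, eff_length, est_counts, tpm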

    data = pd.read_csv(q, sep='\t', usecols=['target_id','tpm'])

    # get sample name
    sample = os.path.basename(os.path.dirname(q))

    # reformat dataframe
    data.set_index('target_id', inplace=True)
    data.rename(columns={"tpm":sample}, inplace=True)

    # Merge dataframes
    if cycle == 1:
        final_df = data
    else:
        final_df = pd.merge(final_df, data, left_index=True, right_index=True)

    sample_list += [sample]
    cycle+=1

# transcript ID --> gene ID
ids = pd.read_csv(tx2gene, sep='\t', names=['transcript','gene'])
ids.set_index('transcript', inplace=True, drop=False)

final_df = pd.merge(final_df, ids, left_index=True, right_index=True)
final_df.set_index('gene', inplace=True)

# Filter genes by TPM
# TPM >=1 in at least one sample
filtered = final_df[~((final_df[sample_list]<1).all(axis=1))]

# Filter genes by biotype
# only keep protein coding genes
df_gtf = read_gtf(gtf)
filtered_pc = filtered[filtered.index.isin(df_gtf.gene_id)]

# Add gene name
id_name = df_gtf[['gene_id','gene_name']].drop_duplicates(ignore_index=True)

index = filtered_pc.index.tolist()
names =[]

for i in range(len(index)):
    id = index[i]
    name = id_name[id_name['gene_id']==id].iloc[0]['gene_name']
    names.append(name)

filtered_pc['gene_name'] = names

# rotate the column list twice so 'transcript' and 'gene_name' become the first two columns
cols = filtered_pc.columns.tolist()
cols = cols[-1:] + cols[:-1]
cols = cols[-1:] + cols[:-1]
filtered_pc = filtered_pc[cols]

# Write to file
filtered_pc.to_csv(outfile, sep='\t')

scripts/pca.py:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

df = pd.read_csv(snakemake.input.tpm, sep='\t', index_col='gene')
df = df.drop(columns=['transcript', 'gene_name'])
df = df.T

# Standardize the values (remove mean and divide by stdev)
X = StandardScaler().fit_transform(df)

# Initialize pca
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
principal_df = pd.DataFrame(data = principal_components, columns = ['PC1', 'PC2'])
principal_df['sample'] = df.index

# Modify labels for legend
def legend_text(tuples):
    labels = []
    for t in tuples:
        if 'NC' in t[0]:
            labels.append(t[0])
        else:
            snoKD = 'SNORD'+t[0]+'_ASO'+str(t[1])
            labels.append(snoKD)
    return labels

# Add condition and sample information to the PCA dataframe
design = pd.read_csv(snakemake.params.design, sep='\s+')
tup = design[['condition','ASO']].apply(tuple, axis=1)
principal_df['label'] = legend_text(tup)

var1, var2 = round(pca.explained_variance_ratio_[0], 4) * 100, round(pca.explained_variance_ratio_[1], 4) * 100

# Create color palette for the samples
def color_palette(labels):
    palette = []
    for l in labels:
        if 'NC' in l and 'dimgray' not in palette:
            palette.append('dimgray')
        elif 'ASO1' in l and 'seagreen' not in palette: # blue, seagreen
            palette.append('seagreen')
        elif 'ASO2' in l and 'mediumseagreen' not in palette: # royalblue, mediumseagreen
            palette.append('mediumseagreen')
        else:
            continue
    return palette

# Create pca_plot function
def pca_plot(df, x_col, y_col, hue_col, xlabel, ylabel, title, path, **kwargs):

    # Creates a PCA (scatter) plot (using an x, y and hue column).

    #sns.set_theme()
    plt.figure(figsize=(5.5,4))
    plt.rcParams['svg.fonttype'] = 'none'
    plt.rcParams["legend.loc"] = 'upper right'

    plt.suptitle(title, fontsize=16)
    sns.scatterplot(data=df, x=x_col, y=y_col, hue=hue_col, palette=color_palette(df[hue_col]), edgecolor='face',
                    alpha=0.7, s=50, **kwargs)

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.legend(fontsize='medium')

    plt.savefig(path, bbox_inches='tight', dpi=600)

# Create PCA scatter plot
pca_plot(principal_df, 'PC1', 'PC2', 'label', f'PC1 ({var1:.2f}%)', f'PC2 ({var2:.2f}%)',
        'PCA plot based on scaled TPM', snakemake.output.plot)

principal_df.to_csv(snakemake.output.tsv, sep='\t', index=False)

scripts/tx2gene.py:
import re

gtf_file = snakemake.input.gtf
out_file = snakemake.output.tsv

def transcript2gene(gtf_file):
    # Creating a transcript -> gene dictionary
    tr_dict = dict()
    with open(str(gtf_file)) as f:
        for line in f:
            trans_id = ''
            gene_id = ''

            if 'gene_id' in line:
                gene_id = re.search(
                    r'gene_id "(.*?)"[;,]', line
                ).group(1)
                gene_id = gene_id.split(',')[0]

            if 'transcript_id' in line:
                trans_id = re.search(
                    r'transcript_id "(.*?)"[;]', line
                ).group(1)

            # Insert in tr dict
            if trans_id and gene_id and trans_id not in tr_dict:
                tr_dict[trans_id] = gene_id

            gene_id = ''
            trans_id = ''
    return tr_dict

# Generate the dictionary
_dict = transcript2gene(gtf_file)

with open(out_file, 'w') as f:
    for key, value in _dict.items():
        f.write(str(key) + '\t' + value + '\n')
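
For illustration, a GTF line whose attribute field reads (hypothetical IDs)

gene_id "ENSG00000000001"; transcript_id "ENST00000000001";

would yield the output row

ENST00000000001	ENSG00000000001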

scripts/voila_figures.py:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pybedtools import BedTool
import os


# Access snakemake inputs, outputs and params
raw_dir = snakemake.params.indir
out_dir = snakemake.params.outdir
exp_genes = snakemake.input.exp_genes
DE_up = snakemake.input.DE_up
DE_down = snakemake.input.DE_down
outfile = snakemake.output.bar_chart
sno = snakemake.params.sno[0]
sno_interactions_dir = snakemake.params.sno_interactions

SPLICING_EVENTS = ['alt3and5prime','alt3prime','alt5prime','alternate_first_exon','alternate_last_exon','alternative_intron',
                    'cassette','multi_exon_spanning','mutually_exclusive','p_alt3prime','p_alt5prime',
                    'p_alternate_first_exon','p_alternate_last_exon','tandem_cassette']


def filter_genes(raw_dir,out_dir,exp_genes):
    # Count unique splicing events and genes per event type
    event_id_count_dic = {}
    gene_id_count_dic = {}
    gene_lists = {}

    for filename in os.listdir(raw_dir):
        f = os.path.join(raw_dir, filename)
        # checking if it is a file
        if os.path.isfile(f) and ".tsv" in filename:

            voila_df = pd.read_csv(f, sep='\t', comment='#')

            if len(voila_df) > 0:
                genes = pd.read_csv(exp_genes, sep='\t', usecols=['gene'])
                outfile = os.path.join(out_dir, filename)

                # Drop genes that are not in the TPM filtered list
                voila_filtered = voila_df[voila_df.gene_id.isin(genes.gene)] 

                # Write to file
                voila_filtered.to_csv(outfile, sep='\t', index=False)

                if filename not in ['heatmap.tsv','junctions.tsv','summary.tsv','other.tsv']:
                    event = filename.split('.')[0]
                    event_id_count_dic[event] = len(pd.unique(voila_filtered['event_id']))
                    unique_genes = pd.unique(voila_filtered['gene_id'])
                    gene_id_count_dic[event] = len(unique_genes)
                    gene_lists[event] = list(unique_genes)

    event_id_count_df = pd.DataFrame({'Event': event_id_count_dic.keys() , 'Count': list(event_id_count_dic.values())})
    event_id_count_df = event_id_count_df.sort_values(by=['Count'], ascending=True)
    gene_id_count_df = pd.DataFrame({'Event': gene_id_count_dic.keys() , 'Count': list(gene_id_count_dic.values())})
    gene_id_count_df = gene_id_count_df.sort_values(by=['Count'], ascending=True)

    return event_id_count_df, gene_id_count_df, gene_lists

def splicing_and_DE(spliced_genes, DE_genes):

    common_count = {}

    # List of genes that are DE
    DE_df = pd.read_csv(DE_genes, sep='\t', usecols=['gene'])

    # For each splicing event type, find genes that are also DE
    for e in list(spliced_genes.keys()):
        common_count[e] = len(set(spliced_genes[e]) & set(DE_df['gene'].values.tolist()))

    df_common = pd.DataFrame({'Event': common_count.keys() , 'Count': list(common_count.values())})
    df_common = df_common.sort_values(by=['Count'], ascending=True)

    return df_common

def parse_sno(sno_dir,sno_id):

    # Prep sno interactions file for bedtools intersect
    sno_file = os.path.join(sno_dir, sno_id + '.bed')
    sno_df = pd.read_csv(sno_file, sep='\t', 
        names=['seqname', 'target_window_start', 'target_window_end', 'snoid', 'count', 'strand',
            'sno_window_start', 'sno_window_end', 'mean_score', 'min_score', 'max_score', 'target'])

    # Drop unnecessary columns for bedtools intersect
    sno_df.drop(columns=['snoid', 'sno_window_start', 'sno_window_end', 'mean_score', 'min_score', 'max_score'], inplace=True)
    sno_df = sno_df[['seqname', 'target_window_start', 'target_window_end', 'target', 'count', 'strand']]

    # Add 'chr' to seqname if not already present
    sno_df = sno_df.astype({'seqname':'string'})
    if 'chr' not in sno_df.iloc[0]['seqname']:
        sno_df['seqname'] = 'chr' + sno_df['seqname']

    return sno_df

def parse_voila(df):

    # Create a dictionary of lists
    # KEY: event_id  VALUE: list of start and end positions of exons
    pos_dic = {}

    for i in range(len(df)):

        event_id = df.iloc[i]['event_id']
        for t in ['reference_exon_coord','spliced_with_coord','junction_coord']:
            if isinstance(df.iloc[i][t], str):
                # keep only numeric fields, converted to ints so that
                # min()/max() below compare coordinates numerically
                coord = [int(el) for el in df.iloc[i][t].split("-") if el not in ['na', '']]
                # drop the placeholder coordinate 1
                coord = [el for el in coord if el != 1]
                if event_id in pos_dic:
                    pos_dic[event_id] = pos_dic[event_id] + coord
                else:
                    pos_dic[event_id] = coord

    # Reformat data to produce a bed file
    bed_df = pd.DataFrame(columns=['seqid', 'start', 'end', 'gene_name', 'strand'])

    info_df = df.drop_duplicates(subset=['event_id']) # df to get info from

    for e in pos_dic.keys():

        # Look up the event's gene and position info
        event_info = info_df[info_df['event_id'] == e]
        gene_name = event_info.iloc[0]['gene_name']
        seq_id = event_info.iloc[0]['seqid']
        strand = event_info.iloc[0]['strand']
        new_event = pd.DataFrame({"gene_name":gene_name,'seqid':seq_id,'strand':strand,'start':[min(pos_dic[e])],'end':[max(pos_dic[e])]})
        bed_df = pd.concat([bed_df, new_event])

    bed_df['count']=3 # arbitrary value to meet bed format
    bed_df = bed_df[['seqid', 'start', 'end', 'gene_name', 'count', 'strand']] # reorder columns

    # Add 'chr' to seqid if not already present
    bed_df = bed_df.astype({'seqid':'string'})
    if 'chr' not in bed_df.iloc[0]['seqid']:
        bed_df['seqid'] = 'chr' + bed_df['seqid']

    return bed_df

def splicing_and_sno_binding(sno_interactions_dir, sno, splicing_dir):

    common_count_event_id = {}
    common_count_gene = {}

    sno_df = parse_sno(sno_interactions_dir, sno) # sno interaction file

    # For each splicing event type, find splicing events/genes that are also bound by the snoRNA
    print('Splicing events bound by snoRNA')
    for filename in os.listdir(splicing_dir):
        if '.tsv' in filename and filename.split(".")[0] in SPLICING_EVENTS:

            f = os.path.join(splicing_dir, filename)
            df = pd.read_csv(f, sep='\t', usecols=['gene_id','gene_name','seqid','strand','event_id',
                                                    'reference_exon_coord','spliced_with_coord','junction_coord'])

            if len(df) > 0:

                splicing_df = parse_voila(df)

                splicing_bed = BedTool.from_dataframe(splicing_df)
                sno_bed = BedTool.from_dataframe(sno_df)

                df_intersect = splicing_bed.intersect(sno_bed, s=True).to_dataframe()
                if len(df_intersect) > 0:
                    # drop duplicate rows
                    df_intersect.drop_duplicates(inplace=True)
                    pd.set_option('display.max_rows', None) # no limit for displaying df rows
                    print(filename)
                    print(df_intersect)
                    common_count_event_id[filename.split(".")[0]] = len(df_intersect)
                    df_intersect.columns = ['seqid','start','end','gene_name','count','strand']
                    df_intersect_by_gene = df_intersect.drop_duplicates(subset=['gene_name'])
                    common_count_gene[filename.split(".")[0]] = len(df_intersect_by_gene)
                else:
                    common_count_event_id[filename.split(".")[0]] = 0
                    common_count_gene[filename.split(".")[0]] = 0

            else:
                common_count_event_id[filename.split(".")[0]] = 0
                common_count_gene[filename.split(".")[0]] = 0

    df_common_event_id = pd.DataFrame({'Event': common_count_event_id.keys() , 'Count': list(common_count_event_id.values())})
    df_common_event_id = df_common_event_id.sort_values(by=['Count'], ascending=True)
    df_common_gene = pd.DataFrame({'Event': common_count_gene.keys() , 'Count': list(common_count_gene.values())})
    df_common_gene = df_common_gene.sort_values(by=['Count'], ascending=True)

    return df_common_event_id, df_common_gene

# Create various horizontal bar charts
def create_figures(event_df, gene_df, gene_lst, DE_up_file, DE_down_file, outfile, sno, sno_interactions_dir, splicing_dir):

    fig, axs = plt.subplots(3, 2, figsize=(10,9))

    # Number of splicing events by event_id
    axs[0,0].barh(event_df['Event'],event_df['Count'], color='dimgray')
    axs[0,0].set_title('Splicing events',size=14)
    axs[0,0].set(xlabel="Number of events")

    # Number of genes per splicing event
    axs[0,1].barh(gene_df['Event'],gene_df['Count'], color='dimgray')
    axs[0,1].set_title('Spliced genes',size=14)
    axs[0,1].set(xlabel="Number of genes")

    # Number of genes that are spliced and DE (downregulated) 
    df_spliced_DE_down = splicing_and_DE(gene_lst, DE_down_file)
    axs[1,0].barh(df_spliced_DE_down['Event'],df_spliced_DE_down['Count'], color='dimgray')
    axs[1,0].set_title('Spliced and downregulated genes',size=14)
    axs[1,0].set(xlabel="Number of genes")

    # Number of genes that are spliced and DE (upregulated)
    df_spliced_DE_up = splicing_and_DE(gene_lst, DE_up_file)
    axs[1,1].barh(df_spliced_DE_up['Event'],df_spliced_DE_up['Count'], color='dimgray')
    axs[1,1].set_title('Spliced and upregulated genes',size=14)
    axs[1,1].set(xlabel="Number of genes")

    # Number of splicing events/genes that are bound by sno
    df_common_event_id, df_common_gene = splicing_and_sno_binding(sno_interactions_dir, sno, splicing_dir)
    axs[2,0].barh(df_common_event_id['Event'],df_common_event_id['Count'], color='dimgray')
    axs[2,0].set_title('Splicing events bound by snoRNA',size=14)
    axs[2,0].set(xlabel="Number of overlaps")
    axs[2,1].barh(df_common_gene['Event'],df_common_gene['Count'], color='dimgray')
    axs[2,1].set_title('Spliced genes bound by snoRNA',size=14)
    axs[2,1].set(xlabel="Number of genes")

    fig.suptitle(sno+' KD',fontsize=16)

    plt.tight_layout()
    plt.savefig(outfile)

    return


# Call functions
event_id_count_df, gene_id_count_df, gene_lists = filter_genes(raw_dir,out_dir,exp_genes)
create_figures(event_id_count_df, gene_id_count_df, gene_lists, DE_up, DE_down, outfile, sno, sno_interactions_dir, out_dir)

scripts/volcano_plot.py:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from gtfparse import read_gtf
import numpy as np

# Get list of genes after TPM filtering
genes_file = snakemake.input.filtered_genes
genes = pd.read_csv(genes_file, sep='\t', usecols=['gene'])

# Configure comparison, pval_threshold and colors
comp1, comp2 = str(snakemake.wildcards.comp).split('-')

if 'NC' not in comp1:
    comp1 = 'SNORD'+comp1+' KD'
if 'NC' not in comp2:
    comp2 = 'SNORD'+comp2+' KD'

#DE_tool, quantifier = str(snakemake.wildcards.DE_tool), str(snakemake.wildcards.quant)
pval_threshold = snakemake.params.pval_threshold
#colors = {'|log2FC| > 1 & padj < '+str(pval_threshold): 'blue','n.s.': 'grey'}
colors = {'log2FC < -1 & padj < '+str(pval_threshold): 'cornflowerblue','log2FC > 1 & padj < '+str(pval_threshold): 'firebrick','n.s.': 'grey'}

# Load DE df
df = pd.read_csv(snakemake.input.DE_output)

# Rename columns
df.columns = ['gene','baseMean','log2FoldChange','lfcSE','stat','pvalue','padj']

# Drop genes/transcripts with NaN in log2FoldChange, pvalue and/or padj
df = df.dropna(subset=['log2FoldChange', 'pvalue', 'padj'])

# Drop genes that are not in the TPM filtered list
df = df[df.gene.isin(genes.gene)] 

# Filter genes by biotype
# only keep protein coding genes
gtf = snakemake.input.gtf
df_gtf = read_gtf(gtf)
df = df[df.gene.isin(df_gtf.gene_id)]

# Create -log10padj column
df['-log10padj'] = -np.log10(df['padj'])

# Create hue column for significant points (|log2FC| > 1 & padj < pval_threshold)
df.loc[(df['padj'] < pval_threshold) & (df['log2FoldChange'] > 1),
        'sig.'] = 'log2FC > 1 & padj < '+str(pval_threshold)
df.loc[(df['padj'] < pval_threshold) & (df['log2FoldChange'] < -1),
        'sig.'] = 'log2FC < -1 & padj < '+str(pval_threshold)
df['sig.'] = df['sig.'].fillna('n.s.')

# Extract genes that are significant
outfile_up = snakemake.output.up_genes
outfile_down = snakemake.output.down_genes
df_genes = df[~df['sig.'].str.contains('n.s.')]
df_genes = df_genes[['gene','log2FoldChange','padj']]

up = df_genes[df_genes['log2FoldChange']>0].copy()
up.sort_values('log2FoldChange',inplace=True,ascending=False)
down = df_genes[df_genes['log2FoldChange']<0].copy()
down.sort_values('log2FoldChange',inplace=True,ascending=False)
up.to_csv(outfile_up, sep='\t', index=False)
down.to_csv(outfile_down, sep='\t', index=False)

# Create volcano function
def volcano(df, x_col, y_col, hue_col, xlabel, ylabel, title, color_dict, path,
            pval_threshold, **kwargs):
    """
    Creates a volcano plot (using a x, y and hue column).
    """
    #sns.set_theme()
    plt.figure(figsize=(5.5,4))
    plt.rcParams['svg.fonttype'] = 'none'

    plt.suptitle(title, fontsize=16)
    sns.scatterplot(data=df, x=x_col, y=y_col, hue=hue_col,
                    palette=color_dict, edgecolor='face',
                    s=25, **kwargs)

    # Add threshold lines: padj cutoff (horizontal) and log2FC = ±1 (vertical)
    plt.axhline(y=-np.log10(pval_threshold), color='black', ls='--', lw=0.5)
    plt.axvline(x=np.log2(2), color='black', ls='--', lw=0.5)
    plt.axvline(x=np.log2(0.5), color='black', ls='--', lw=0.5)

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    leg = plt.legend(fontsize="medium",bbox_to_anchor=(1.6, 0.4), loc='lower right', ncol=1,
        facecolor='white',framealpha=1)
    leg.get_frame().set_linewidth(0)
    plt.savefig(path, bbox_inches='tight', dpi=600)

    return

# Create volcano
volcano(df, 'log2FoldChange', '-log10padj', 'sig.',
        'log2(Fold Change)', '-log10(FDR-adjusted p-value)',
        f'{comp1} vs {comp2} using DESeq2',
        colors, snakemake.output.volcano, pval_threshold)

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/kristinassong/RNAseq
Name: rnaseq
Version: 2
Downloaded: 0
Copyright: Public Domain
License: None
