A snakemake pipeline to analyze RNA-seq expression data



Pre-installation of the HTSlib library (required by MAJIQ and VOILA)

wget 
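
Once HTSlib is built and installed, MAJIQ's installer locates it through environment variables. A minimal sketch, assuming an install prefix of $HOME/htslib (hypothetical path; the variable names follow MAJIQ's installation notes):

export HTSLIB_LIBRARY_DIR=$HOME/htslib/lib
export HTSLIB_INCLUDE_DIR=$HOME/htslib/include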

Code Snippets

STAR: build the genome index
shell:
    "STAR --runThreadN {threads} "
    "--runMode genomeGenerate "
    "--genomeDir {output} "
    "--genomeFastaFiles {input.fasta} "
    "--sjdbGTFfile {input.gtf} "
    "--sjdbOverhang 99 "
    "&> {log}"

STAR: align reads
shell:
    "STAR --runMode alignReads "
    "--genomeDir {input.idx} "
    "--readFilesIn {input.fq1} {input.fq2} "
    "--runThreadN {threads} "
    "--readFilesCommand zcat --outReadsUnmapped Fastx "
    "--outFilterType BySJout --outStd Log --outSAMunmapped None "
    "--outSAMtype BAM SortedByCoordinate "
    "--outFileNamePrefix {params.out_prefix} "
    "{params.extra} "
    "&> {log}"

samtools view: keep primary alignments only (-F 256 filters out secondary alignments)
shell:
    "samtools view -b -F 256 -o {output} {input} "
    "&> {log}"

samtools index: index the BAM file
shell:
    "samtools index {input} "
    "&> {log}"  

bedtools genomecov: coverage as a position-sorted bedGraph
shell:
    "bedtools genomecov -bg -split -ibam {input} | sort -k1,1 -k2,2n > {output}"

DESeq2 differential expression (script)
script:
    "../scripts/DESeq2_kallisto_tximport.R"

Volcano plot (script)
script:
    "../scripts/volcano_plot.py"

GO enrichment bar charts (script)
script:
    "../scripts/GO_bar_charts.py"

GO enrichment bar charts (script)
script:
    "../scripts/GO_bar_charts.py"

FastQC quality control
shell:
    "fastqc "
    "--outdir {params} "
    "--format fastq "
    "{input} "
    "&> {log}"

Rename FastQC outputs to the expected paths
shell:
    "touch {output.html} && touch {output.zip} && "
    "mv {input.html} {output.html} && "
    "mv {input.zip} {output.zip}"

FastQC quality control
shell:
    "fastqc "
    "--outdir {params} "
    "--format fastq "
    "{input} "
    "&> {log}"

FastQC quality control
shell:
    "fastqc "
    "--outdir {params} "
    "--format fastq "
    "{input} "
    "&> {log}"

Picard CollectInsertSizeMetrics: insert size metrics
shell:
    "picard CollectInsertSizeMetrics "
    "--INPUT {input} "
    "--OUTPUT {output.txt} "
    "--Histogram_FILE {output.pdf} "
    "{params} "
    "&> {log}"

gffread: extract transcript sequences from the genome
shell:
    "gffread {input.gtf} -g {input.genome} -w {output}"

kallisto index: build the transcriptome index
shell:
    "kallisto index "
    "--index={output} "
    "--kmer-size={params} "
    "{input} "
    "&> {log}"

kallisto quant: transcript quantification
shell:
    "kallisto quant "
    "--bias "
    "--index={input.idx} "
    "--output-dir={params.outdir} "
    "--bootstrap-samples={params.bootstrap} "
    "--threads={threads} "
    "{input.fq1} {input.fq2} "
    "&> {log}"

Transcript-to-gene mapping (script)
script:
    "../scripts/tx2gene.py"

grep: keep protein-coding entries
shell:
    "grep \'protein_coding\' {input} > {output}"

Merge kallisto quantifications into a TPM matrix (script)
script:
    "../scripts/merge_kallisto_quant.py"

PCA of samples (script)
script:
    "../scripts/pca.py"

MultiQC: aggregate QC reports
shell:
    "multiqc {params.scan_dir} "
    "--outdir {params.outdir} "
    "&> {log}"

MAJIQ build
shell:
    "source {params.majiq} && "
    "majiq build {input.gff3} -c {params.config} -j 8 -o {params.outdir} "
    "&> {log} && deactivate"

MAJIQ deltapsi
shell:
    "source {params.majiq_tool} && "
    "majiq deltapsi -grp1 {params.grp1_majiq_files} -grp2 {params.grp2_majiq_files} "
    "-j 8 -o {params.outdir} --name {params.group} "
    "&> {log} && deactivate"

VOILA modulize
shell:
    "source {params.majiq_tool} && "
    "voila modulize {input.splicegraph} {input.voila} "
    "-d {params.outdir} -j 8 --overwrite --decomplexify-deltapsi-threshold 0.05 "
    "&> {log} && deactivate"

VOILA figures (script)
script:
    "../scripts/voila_figures.py"

rMATS splicing analysis (--b1 and --b2 are text files listing each group's BAM paths)
shell:
    "rmats.py --b1 {input.group1} --b2 {input.group2} "
    "--gtf {input.gtf} -t paired --readLength {params.readlength} "
    "--nthread 4 --od {output.outdir} --tmp {output.tmpdir}"

Filter rMATS output (script)
script:
    "../scripts/filter_rmats.py"

Trimmomatic: paired-end read trimming
shell:
    "trimmomatic PE "
    "-threads {threads} "
    "{params.extra} "
    "{input.r1} {input.r2} "
    "{output.r1} {output.unpaired_r1} "
    "{output.r2} {output.unpaired_r2} "
    "{params.trimmer} "
    "&> {log}"

scripts/DESeq2_kallisto_tximport.R:
library(readr)
library(tximport)
library(DESeq2)

# Variables passed in by Snakemake
kallisto_dir <- snakemake@params[["kallisto_dir"]]
output_dir <- snakemake@output[["results"]]
design_file <- snakemake@input[["samples"]]
comparison_file <- snakemake@input[["comparisons"]]
transcript_gene_file <- snakemake@input[["gene_id"]]

# create the directory that will contain the results
dir.create(output_dir, showWarnings=FALSE)

#############################################################
#------------- Importing data and information ---------------
#############################################################
# Loading samples information from the design file.
sampleTable <- read.table(
    design_file, header=TRUE,
    row.names="sample", check.names=FALSE
)
conditions <- unique(sampleTable$condition)
samples <- rownames(sampleTable)
sampleTable$condition <- factor(sampleTable$condition, levels=conditions)

# Loading the comparisons
comparisons <- read.table(
    comparison_file, header=TRUE
)

# Read the transcript-gene matrix
tx2gene <- read_tsv(transcript_gene_file, col_names=c('TXNAME', 'GENEID'))

# Get the h5 files for all conditions
files <- file.path(kallisto_dir, samples, "abundance.h5")
names(files) <- samples
txi.kallisto <- tximport(files, type="kallisto", tx2gene=tx2gene) # for gene differential analysis
# txi.kallisto <- tximport(files, type="kallisto", txOut=TRUE) # for transcript differential analysis


#############################################################
#--------------- Creating the DESeq2 object -----------------
#############################################################
# Create the DESeq2 object with all the conditions
dds <- DESeqDataSetFromTximport(
    txi.kallisto,
    sampleTable,
    ~condition
)

# pre-filter low-count genes
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

# set the factor levels for the condition column
dds$condition <- factor(dds$condition, conditions)

# call the function for differential gene expression analysis
dds <- DESeq(dds)


#############################################################
#-------------- Looping over all comparisons ----------------
#############################################################
for (row in 1:nrow(comparisons)) {

    condition1 <- comparisons[row, "cdn1"]
    condition2  <- comparisons[row, "cdn2"]
    exp <- sprintf("%s-%s", condition1, condition2)

    res <- results(
        dds,
        contrast = c(
            "condition",
            condition2,
            condition1
        )
        #pAdjustMethod = "fdr"
    )

    # transform the result to data frame
    res_df <- as.data.frame(res)

    # Writing results to file
    fname <- paste(
        output_dir,
        paste(exp, "csv", sep='.'),
        sep='/'
    )

    write.csv(
        res_df,
        file=fname,
        quote=FALSE
    )
}
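
The script reads two small tab-separated metadata files. A minimal sketch of their layout, inferred from the read.table calls above (sample and condition names are hypothetical; scripts/pca.py additionally expects an ASO column in the design file):

# design file: one row per sample
sample    condition
NC5_1     NC5
NC5_2     NC5
22_1      22
22_2      22

# comparisons file: one contrast per row (cdn1 vs cdn2)
cdn1    cdn2
NC5     22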

scripts/filter_rmats.py:
import pandas as pd
from pybedtools import BedTool
import os


# Access snakemake inputs, outputs and params
raw_dir = snakemake.params.indir # splicing event dir
out_dir = snakemake.params.outdir
tpm = snakemake.params.tpm
sno_file = snakemake.input.sno
eftud2_file = snakemake.input.eftud2
prpf8_file = snakemake.input.prpf8

SPLICING_EVENTS = ['SE.MATS.JC.txt','A5SS.MATS.JC.txt','A3SS.MATS.JC.txt','MXE.MATS.JC.txt']


def filter_by_threshold(df):
    # Filter rMATS output by FDR and IncLevelDifference
    filtered_df = df[df['FDR']<0.05]
    filtered_df = filtered_df[filtered_df['IncLevelDifference'].abs()>0.20]
    return filtered_df

def filter_by_tpm(splicing_df,tpm_df):
    # Keep genes that are expressed in at least one condition (NC5 & SNORD22 KD in kallisto TPM matrix)
    expressed_df = pd.DataFrame(columns=splicing_df.columns)
    # List of unique spliced genes
    uniq = set(splicing_df['GeneID'].values.tolist())
    for gene in uniq:
        gene_tpm_df = tpm_df[tpm_df['gene']==gene]
        if len(gene_tpm_df)>0: # gene present in the TPM-filtered matrix (TPM >= 1 in at least one sample)
            gene_splicing_df = splicing_df[splicing_df['GeneID']==gene]
            expressed_df = pd.concat([expressed_df,gene_splicing_df],ignore_index=True)
    return expressed_df

def parse_splicing(df,type):
    # Transform splicing df to bed
    df['score']=3 # arbitrary placeholder score for BED format
    if type == 'SE' or type == 'MXE':
        df = df[['chr','upstreamES','downstreamEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
        df.rename(columns={"upstreamES": "start", "downstreamEE": "end"},inplace=True)
    else: # A3SS or A5SS
        pos_df = df[df['strand']=='+']
        neg_df = df[df['strand']=='-']
        if type == 'A3SS':
            pos_df = pos_df[['chr','flankingES','shortEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            pos_df.rename(columns={"flankingES": "start", "shortEE": "end"},inplace=True)
            neg_df = neg_df[['chr','longExonStart_0base','flankingEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            neg_df.rename(columns={"longExonStart_0base": "start", "flankingEE": "end"},inplace=True)
        else: # A5SS
            pos_df = pos_df[['chr','longExonStart_0base','flankingEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            pos_df.rename(columns={"longExonStart_0base": "start", "flankingEE": "end"},inplace=True)
            neg_df = neg_df[['chr','flankingES','shortEE','geneSymbol','score','strand','GeneID','FDR','IncLevelDifference']]
            neg_df.rename(columns={"flankingES": "start", "shortEE": "end"},inplace=True)
        df = pd.concat([pos_df,neg_df],ignore_index=True)
    return df

def binding_ovlp(df1,df2):
    # bedtools intersect between binding interactions
    bed1 = BedTool.from_dataframe(df1)
    bed2 = BedTool.from_dataframe(df2)
    intersection_df = bed1.intersect(bed2,s=True).to_dataframe()
    return intersection_df

def spliced_and_bound(splicing_df,event_type,binding_df):
    # bedtools intersect splicing events with binding interactions
    if 'start' not in splicing_df.columns:
        splicing_df = parse_splicing(splicing_df,event_type)
    splicing_bed = BedTool.from_dataframe(splicing_df)
    binding_bed = BedTool.from_dataframe(binding_df)
    intersection_df = splicing_bed.intersect(binding_bed, s=True,u=True).to_dataframe()
    if len(intersection_df)>0:
        intersection_df.columns = splicing_df.columns
        intersection_df = intersection_df.sort_values(by=['IncLevelDifference','FDR'],key=abs,ascending=[False,True]).drop_duplicates(subset=['start','end'],keep='last')
    return intersection_df

def get_full_coordinates(filtered_df,org_df):
    # Get full information on filtered splicing events
    final_df = pd.DataFrame(columns=org_df.columns)
    for i in range(len(filtered_df)):
        gene = filtered_df.iloc[i]['geneSymbol']
        fdr = filtered_df.iloc[i]['FDR']
        lvldiff = filtered_df.iloc[i]['IncLevelDifference']
        row = org_df[(org_df['geneSymbol']==gene)&(org_df['FDR']==fdr)&(org_df['IncLevelDifference']==lvldiff)]
        final_df = pd.concat([final_df,row],ignore_index=True)
    final_df.drop('score', axis=1,inplace=True)
    return final_df

# Kallisto TPM matrix
tpm_df = pd.read_csv(tpm,sep='\t')
# snoRNA binding interactions
sno_df = pd.read_csv(sno_file,sep='\t')
# RBP binding interactions
eftud2_df = pd.read_csv(eftud2_file,sep='\t')
prpf8_df = pd.read_csv(prpf8_file,sep='\t')

# Overlap of binding interactions
sno_eftud2_df = binding_ovlp(sno_df,eftud2_df)
sno_prpf8_df = binding_ovlp(sno_df,prpf8_df)
sno_eftud2_prpf8_df = binding_ovlp(sno_eftud2_df,prpf8_df)

# For each splicing event type, FILTER and find events bound by snoRNA and RBPs
for event in SPLICING_EVENTS:
    print(event)
    file = os.path.join(raw_dir, event)
    event_type = event.split(".")[0]
    df = pd.read_csv(file, sep='\t')
    filtered = filter_by_threshold(df)
    filtered = filter_by_tpm(filtered,tpm_df)

    # Bound by snoRNA
    print('Bound by SNORD22')
    sno_bound = spliced_and_bound(filtered,event_type,sno_df)
    get_full_coordinates(sno_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22.tsv'),sep='\t',index=None)
    # Bound by snoRNA and EFTUD2
    print('Bound by SNORD22 and EFTUD2')
    sno_eftud2_bound = spliced_and_bound(filtered,event_type,sno_eftud2_df)
    get_full_coordinates(sno_eftud2_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22_EFTUD2.tsv'),sep='\t',index=None)
    # Bound by snoRNA and PRPF8
    print('Bound by SNORD22 and PRPF8')
    sno_prpf8_bound = spliced_and_bound(filtered,event_type,sno_prpf8_df)
    get_full_coordinates(sno_prpf8_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22_PRPF8.tsv'),sep='\t',index=None)
    # Bound by snoRNA, EFTUD2 and PRPF8
    print('Bound by SNORD22, EFTUD2 and PRPF8')
    sno_eftud2_prpf8_bound = spliced_and_bound(filtered,event_type,sno_eftud2_prpf8_df)
    get_full_coordinates(sno_eftud2_prpf8_bound,filtered).to_csv(os.path.join(out_dir,event_type+'_SNORD22_EFTUD2_PRPF8.tsv'),sep='\t',index=None)
    print('Done')

scripts/GO_bar_charts.py:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gprofiler import GProfiler

# Get list of genes
genes = list(pd.read_csv(snakemake.input.genes, sep='\t')['gene'])

# Run gprofiler for enrichment analysis of functional (GO and other) terms
### NEED INTERNET CONNECTION!
gp = GProfiler(return_dataframe=True)
go_results = gp.profile(organism='hsapiens', query=genes)

# Log-transform p-values
go_results['negative_log10_of_p_value'] = np.negative(np.log10(go_results['p_value']))

# Extract GO terms of interest
go_mf = go_results[go_results['source']=='GO:MF'].sort_values(by=['negative_log10_of_p_value'])
go_cc = go_results[go_results['source']=='GO:CC'].sort_values(by=['negative_log10_of_p_value'])
go_bp = go_results[go_results['source']=='GO:BP'].sort_values(by=['negative_log10_of_p_value'])

go_terms = {"GO:BP":go_bp,"GO:MF":go_mf,"GO:CC":go_cc}
colours = {"GO:BP":"indianred","GO:MF":"limegreen","GO:CC":"steelblue"}

go_concat = pd.DataFrame()
colours_present = {}

# Only keep GO sources that returned at least one enriched term
for term in go_terms.keys():
    if len(go_terms[term]) > 0:
        go_concat = pd.concat([go_concat,go_terms[term]], ignore_index=True)
        colours_present.update({term:colours[term]})

# Represent GO results with a bar chart
plt.figure(figsize=(9,4))

bars = pd.DataFrame({'source':go_concat.source.values.tolist(),'p_val':go_concat.negative_log10_of_p_value.values.tolist()},
    index=go_concat.name.values.tolist())

bars['p_val'].plot(kind="barh",color=bars['source'].replace(colours_present),width=0.7)
plt.title('GO Analysis of genes')
plt.xlabel('$-log_{10}$(p-value)')

labels = list(colours_present.keys())
handles = [plt.Rectangle((0,0),1,1, color=colours_present[label]) for label in labels]
plt.legend(handles,labels,loc='lower right')

plt.tight_layout()
plt.savefig(snakemake.output.bar_chart)

scripts/merge_kallisto_quant.py:
import pandas as pd
from gtfparse import read_gtf
import os

tx2gene = snakemake.input.tx2gene
gtf = snakemake.input.gtf
outfile = snakemake.output.tpm

final_df = pd.DataFrame()
cycle = 1
sample_list = []

for q in snakemake.input.quant:
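    # each kallisto abundance.tsv has the columns: target_id, length, eff_length, est_counts, tpm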

    data = pd.read_csv(q, sep='\t', usecols=['target_id','tpm'])

    # get sample name
    sample = os.path.basename(os.path.dirname(q))

    # reformat dataframe
    data.set_index('target_id', inplace=True)
    data.rename(columns={"tpm":sample}, inplace=True)

    # Merge dataframes
    if cycle == 1:
        final_df = data
    else:
        final_df = pd.merge(final_df, data, left_index=True, right_index=True)

    sample_list += [sample]
    cycle+=1

# transcript ID --> gene ID
ids = pd.read_csv(tx2gene, sep='\t', names=['transcript','gene'])
ids.set_index('transcript', inplace=True, drop=False)

final_df = pd.merge(final_df, ids, left_index=True, right_index=True)
final_df.set_index('gene', inplace=True)

# Filter genes by TPM
# TPM >=1 in at least one sample
filtered = final_df[~((final_df[sample_list]<1).all(axis=1))]

# Filter genes by biotype
# only keep protein coding genes
df_gtf = read_gtf(gtf)
filtered_pc = filtered[filtered.index.isin(df_gtf.gene_id)]

# Add gene name
id_name = df_gtf[['gene_id','gene_name']].drop_duplicates(ignore_index=True)

index = filtered_pc.index.tolist()
names =[]

for i in range(len(index)):
    id = index[i]
    name = id_name[id_name['gene_id']==id].iloc[0]['gene_name']
    names.append(name)

filtered_pc['gene_name'] = names

# rotate the column list twice so 'transcript' and 'gene_name' become the first two columns
cols = filtered_pc.columns.tolist()
cols = cols[-1:] + cols[:-1]
cols = cols[-1:] + cols[:-1]
filtered_pc = filtered_pc[cols]

# Write to file
filtered_pc.to_csv(outfile, sep='\t')

scripts/pca.py:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

df = pd.read_csv(snakemake.input.tpm, sep='\t', index_col='gene')
df = df.drop(columns=['transcript', 'gene_name'])
df = df.T

# Standardize the values (remove mean and divide by stdev)
X = StandardScaler().fit_transform(df)

# Initialize pca
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
principal_df = pd.DataFrame(data = principal_components, columns = ['PC1', 'PC2'])
principal_df['sample'] = df.index

# Modify labels for legend
def legend_text(tuples):
    labels = []
    for t in tuples:
        if 'NC' in t[0]:
            labels.append(t[0])
        else:
            snoKD = 'SNORD'+t[0]+'_ASO'+str(t[1])
            labels.append(snoKD)
    return labels

# Add condition and sample information to the PCA dataframe
design = pd.read_csv(snakemake.params.design, sep='\s+')
tup = design[['condition','ASO']].apply(tuple, axis=1)
principal_df['label'] = legend_text(tup)

var1, var2 = round(pca.explained_variance_ratio_[0], 4) * 100, round(pca.explained_variance_ratio_[1], 4) * 100

# Create color palette for the samples
def color_palette(labels):
    palette = []
    for l in labels:
        if 'NC' in l and 'dimgray' not in palette:
            palette.append('dimgray')
        elif 'ASO1' in l and 'seagreen' not in palette: # blue, seagreen
            palette.append('seagreen')
        elif 'ASO2' in l and 'mediumseagreen' not in palette: # royalblue, mediumseagreen
            palette.append('mediumseagreen')
        else:
            continue
    return palette

# Create pca_plot function
def pca_plot(df, x_col, y_col, hue_col, xlabel, ylabel, title, path, **kwargs):

    # Creates a PCA (scatter) plot (using an x, y and hue column).

    #sns.set_theme()
    plt.figure(figsize=(5.5,4))
    plt.rcParams['svg.fonttype'] = 'none'
    plt.rcParams["legend.loc"] = 'upper right'

    plt.suptitle(title, fontsize=16)
    sns.scatterplot(data=df, x=x_col, y=y_col, hue=hue_col, palette=color_palette(df[hue_col]), edgecolor='face',
                    alpha=0.7, s=50, **kwargs)

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.legend(fontsize='medium')

    plt.savefig(path, bbox_inches='tight', dpi=600)

# Create PCA scatter plot
pca_plot(principal_df, 'PC1', 'PC2', 'label', f'PC1 ({var1:.2f}%)', f'PC2 ({var2:.2f}%)',
        'PCA plot based on scaled TPM', snakemake.output.plot)

principal_df.to_csv(snakemake.output.tsv, sep='\t', index=False)

scripts/tx2gene.py:
import re

gtf_file = snakemake.input.gtf
out_file = snakemake.output.tsv

def transcript2gene(gtf_file):
    # Creating a transcript -> gene dictionary
    tr_dict = dict()
    with open(str(gtf_file)) as f:
        for line in f:
            trans_id = ''
            gene_id = ''

            if 'gene_id' in line:
                gene_id = re.search(
                    r'gene_id "(.*?)"[;,]', line
                ).group(1)
                gene_id = gene_id.split(',')[0]

            if 'transcript_id' in line:
                trans_id = re.search(
                    r'transcript_id "(.*?)"[;]', line
                ).group(1)

            # Insert in tr dict
            if trans_id and gene_id and trans_id not in tr_dict:
                tr_dict[trans_id] = gene_id

            gene_id = ''
            trans_id = ''
    return tr_dict

# Generate the dictionary
_dict = transcript2gene(gtf_file)

with open(out_file, 'w') as f:
    for key, value in _dict.items():
        f.write(str(key) + '\t' + value + '\n')
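
For illustration, a GTF line whose attribute field reads (hypothetical IDs)

gene_id "ENSG00000000001"; transcript_id "ENST00000000001";

would yield the output row

ENST00000000001	ENSG00000000001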

scripts/voila_figures.py:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pybedtools import BedTool
import os


# Access snakemake inputs, outputs and params
raw_dir = snakemake.params.indir
out_dir = snakemake.params.outdir
exp_genes = snakemake.input.exp_genes
DE_up = snakemake.input.DE_up
DE_down = snakemake.input.DE_down
outfile = snakemake.output.bar_chart
sno = snakemake.params.sno[0]
sno_interactions_dir = snakemake.params.sno_interactions

SPLICING_EVENTS = ['alt3and5prime','alt3prime','alt5prime','alternate_first_exon','alternate_last_exon','alternative_intron',
                    'cassette','multi_exon_spanning','mutually_exclusive','p_alt3prime','p_alt5prime',
                    'p_alternate_first_exon','p_alternate_last_exon','tandem_cassette']


def filter_genes(raw_dir,out_dir,exp_genes):
    # Count unique splicing events and genes per event type
    event_id_count_dic = {}
    gene_id_count_dic = {}
    gene_lists = {}

    for filename in os.listdir(raw_dir):
        f = os.path.join(raw_dir, filename)
        # checking if it is a file
        if os.path.isfile(f) and ".tsv" in filename:

            voila_df = pd.read_csv(f, sep='\t', comment='#')

            if len(voila_df) > 0:
                genes = pd.read_csv(exp_genes, sep='\t', usecols=['gene'])
                outfile = os.path.join(out_dir, filename)

                # Drop genes that are not in the TPM filtered list
                voila_filtered = voila_df[voila_df.gene_id.isin(genes.gene)] 

                # Write to file
                voila_filtered.to_csv(outfile, sep='\t', index=False)

                if filename not in ['heatmap.tsv','junctions.tsv','summary.tsv','other.tsv']:
                    event = filename.split('.')[0]
                    event_id_count_dic[event] = len(pd.unique(voila_filtered['event_id']))
                    unique_genes = pd.unique(voila_filtered['gene_id'])
                    gene_id_count_dic[event] = len(unique_genes)
                    gene_lists[event] = list(unique_genes)

    event_id_count_df = pd.DataFrame({'Event': event_id_count_dic.keys() , 'Count': list(event_id_count_dic.values())})
    event_id_count_df = event_id_count_df.sort_values(by=['Count'], ascending=True)
    gene_id_count_df = pd.DataFrame({'Event': gene_id_count_dic.keys() , 'Count': list(gene_id_count_dic.values())})
    gene_id_count_df = gene_id_count_df.sort_values(by=['Count'], ascending=True)

    return event_id_count_df, gene_id_count_df, gene_lists

def splicing_and_DE(spliced_genes, DE_genes):

    common_count = {}

    # List of genes that are DE
    DE_df = pd.read_csv(DE_genes, sep='\t', usecols=['gene'])

    # For each splicing event type, find genes that are also DE
    for e in list(spliced_genes.keys()):
        common_count[e] = len(set(spliced_genes[e]) & set(DE_df['gene'].values.tolist()))

    df_common = pd.DataFrame({'Event': common_count.keys() , 'Count': list(common_count.values())})
    df_common = df_common.sort_values(by=['Count'], ascending=True)

    return df_common

def parse_sno(sno_dir,sno_id):

    # Prep sno interactions file for bedtools intersect
    sno_file = os.path.join(sno_dir, sno_id + '.bed')
    sno_df = pd.read_csv(sno_file, sep='\t', 
        names=['seqname', 'target_window_start', 'target_window_end', 'snoid', 'count', 'strand',
            'sno_window_start', 'sno_window_end', 'mean_score', 'min_score', 'max_score', 'target'])

    # Drop unnecessary columns for bedtools intersect
    sno_df.drop(columns=['snoid', 'sno_window_start', 'sno_window_end', 'mean_score', 'min_score', 'max_score'], inplace=True)
    sno_df = sno_df[['seqname', 'target_window_start', 'target_window_end', 'target', 'count', 'strand']]

    # Add 'chr' to seqname if not already present
    sno_df = sno_df.astype({'seqname':'string'})
    if 'chr' not in sno_df.iloc[0]['seqname']:
        sno_df['seqname'] = 'chr' + sno_df['seqname']

    return sno_df

def parse_voila(df):

    # Create a dictionary of lists
    # KEY: event_id  VALUE: list of start and end positions of exons
    pos_dic = {}

    for i in range(len(df)):

        event_id = df.iloc[i]['event_id']
        for t in ['reference_exon_coord','spliced_with_coord','junction_coord']:
            if isinstance(df.iloc[i][t], str):
                # keep only numeric fields, converted to ints so that
                # min()/max() below compare coordinates numerically
                coord = [int(el) for el in df.iloc[i][t].split("-") if el not in ['na', '']]
                # drop the placeholder coordinate 1
                coord = [el for el in coord if el != 1]
                if event_id in pos_dic:
                    pos_dic[event_id] = pos_dic[event_id] + coord
                else:
                    pos_dic[event_id] = coord

    # Reformat data to produce a bed file
    bed_df = pd.DataFrame(columns=['seqid', 'start', 'end', 'gene_name', 'strand'])

    info_df = df.drop_duplicates(subset=['event_id']) # df to get info from

    for e in pos_dic.keys():

        # Look up the event's gene and position info
        event_info = info_df[info_df['event_id'] == e]
        gene_name = event_info.iloc[0]['gene_name']
        seq_id = event_info.iloc[0]['seqid']
        strand = event_info.iloc[0]['strand']
        new_event = pd.DataFrame({"gene_name":gene_name,'seqid':seq_id,'strand':strand,'start':[min(pos_dic[e])],'end':[max(pos_dic[e])]})
        bed_df = pd.concat([bed_df, new_event])

    bed_df['count']=3 # arbitrary value to meet bed format
    bed_df = bed_df[['seqid', 'start', 'end', 'gene_name', 'count', 'strand']] # reorder columns

    # Add 'chr' to seqid if not already present
    bed_df = bed_df.astype({'seqid':'string'})
    if 'chr' not in bed_df.iloc[0]['seqid']:
        bed_df['seqid'] = 'chr' + bed_df['seqid']

    return bed_df

def splicing_and_sno_binding(sno_interactions_dir, sno, splicing_dir):

    common_count_event_id = {}
    common_count_gene = {}

    sno_df = parse_sno(sno_interactions_dir, sno) # sno interaction file

    # For each splicing event type, find splicing events/genes that are also bound by the snoRNA
    print('Splicing events bound by snoRNA')
    for filename in os.listdir(splicing_dir):
        if '.tsv' in filename and filename.split(".")[0] in SPLICING_EVENTS:

            f = os.path.join(splicing_dir, filename)
            df = pd.read_csv(f, sep='\t', usecols=['gene_id','gene_name','seqid','strand','event_id',
                                                    'reference_exon_coord','spliced_with_coord','junction_coord'])

            if len(df) > 0:

                splicing_df = parse_voila(df)

                splicing_bed = BedTool.from_dataframe(splicing_df)
                sno_bed = BedTool.from_dataframe(sno_df)

                df_intersect = splicing_bed.intersect(sno_bed, s=True).to_dataframe()
                if len(df_intersect) > 0:
                    # drop duplicate rows
                    df_intersect.drop_duplicates(inplace=True)
                    pd.set_option('display.max_rows', None) # no limit for displaying df rows
                    print(filename)
                    print(df_intersect)
                    common_count_event_id[filename.split(".")[0]] = len(df_intersect)
                    df_intersect.columns = ['seqid','start','end','gene_name','count','strand']
                    df_intersect_by_gene = df_intersect.drop_duplicates(subset=['gene_name'])
                    common_count_gene[filename.split(".")[0]] = len(df_intersect_by_gene)
                else:
                    common_count_event_id[filename.split(".")[0]] = 0
                    common_count_gene[filename.split(".")[0]] = 0

            else:
                common_count_event_id[filename.split(".")[0]] = 0
                common_count_gene[filename.split(".")[0]] = 0

    df_common_event_id = pd.DataFrame({'Event': common_count_event_id.keys() , 'Count': list(common_count_event_id.values())})
    df_common_event_id = df_common_event_id.sort_values(by=['Count'], ascending=True)
    df_common_gene = pd.DataFrame({'Event': common_count_gene.keys() , 'Count': list(common_count_gene.values())})
    df_common_gene = df_common_gene.sort_values(by=['Count'], ascending=True)

    return df_common_event_id, df_common_gene

# Create various horizontal bar charts
def create_figures(event_df, gene_df, gene_lst, DE_up_file, DE_down_file, outfile, sno, sno_interactions_dir, splicing_dir):

    fig, axs = plt.subplots(3, 2, figsize=(10,9))

    # Number of splicing events by event_id
    axs[0,0].barh(event_df['Event'],event_df['Count'], color='dimgray')
    axs[0,0].set_title('Splicing events',size=14)
    axs[0,0].set(xlabel="Number of events")

    # Number of genes per splicing event
    axs[0,1].barh(gene_df['Event'],gene_df['Count'], color='dimgray')
    axs[0,1].set_title('Spliced genes',size=14)
    axs[0,1].set(xlabel="Number of genes")

    # Number of genes that are spliced and DE (downregulated) 
    df_spliced_DE_down = splicing_and_DE(gene_lst, DE_down_file)
    axs[1,0].barh(df_spliced_DE_down['Event'],df_spliced_DE_down['Count'], color='dimgray')
    axs[1,0].set_title('Spliced and downregulated genes',size=14)
    axs[1,0].set(xlabel="Number of genes")

    # Number of genes that are spliced and DE (upregulated)
    df_spliced_DE_up = splicing_and_DE(gene_lst, DE_up_file)
    axs[1,1].barh(df_spliced_DE_up['Event'],df_spliced_DE_up['Count'], color='dimgray')
    axs[1,1].set_title('Spliced and upregulated genes',size=14)
    axs[1,1].set(xlabel="Number of genes")

    # Number of splicing events/genes that are bound by sno
    df_common_event_id, df_common_gene = splicing_and_sno_binding(sno_interactions_dir, sno, splicing_dir)
    axs[2,0].barh(df_common_event_id['Event'],df_common_event_id['Count'], color='dimgray')
    axs[2,0].set_title('Splicing events bound by snoRNA',size=14)
    axs[2,0].set(xlabel="Number of overlaps")
    axs[2,1].barh(df_common_gene['Event'],df_common_gene['Count'], color='dimgray')
    axs[2,1].set_title('Spliced genes bound by snoRNA',size=14)
    axs[2,1].set(xlabel="Number of genes")

    fig.suptitle(sno+' KD',fontsize=16)

    plt.tight_layout()
    plt.savefig(outfile)

    return


# Call functions
event_id_count_df, gene_id_count_df, gene_lists = filter_genes(raw_dir,out_dir,exp_genes)
create_figures(event_id_count_df, gene_id_count_df, gene_lists, DE_up, DE_down, outfile, sno, sno_interactions_dir, out_dir)

scripts/volcano_plot.py:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from gtfparse import read_gtf
import numpy as np

# Get list of genes after TPM filtering
genes_file = snakemake.input.filtered_genes
genes = pd.read_csv(genes_file, sep='\t', usecols=['gene'])

# Configure comparison, pval_threshold and colors
comp1, comp2 = str(snakemake.wildcards.comp).split('-')

if 'NC' not in comp1:
    comp1 = 'SNORD'+comp1+' KD'
if 'NC' not in comp2:
    comp2 = 'SNORD'+comp2+' KD'

#DE_tool, quantifier = str(snakemake.wildcards.DE_tool), str(snakemake.wildcards.quant)
pval_threshold = snakemake.params.pval_threshold
#colors = {'|log2FC| > 1 & padj < '+str(pval_threshold): 'blue','n.s.': 'grey'}
colors = {'log2FC < -1 & padj < '+str(pval_threshold): 'cornflowerblue','log2FC > 1 & padj < '+str(pval_threshold): 'firebrick','n.s.': 'grey'}

# Load DE df
df = pd.read_csv(snakemake.input.DE_output)

# Rename columns
df.columns = ['gene','baseMean','log2FoldChange','lfcSE','stat','pvalue','padj']

# Drop genes/transcripts with NaN in log2FoldChange, pvalue and/or padj
df = df.dropna(subset=['log2FoldChange', 'pvalue', 'padj'])

# Drop genes that are not in the TPM filtered list
df = df[df.gene.isin(genes.gene)] 

# Filter genes by biotype
# only keep protein coding genes
gtf = snakemake.input.gtf
df_gtf = read_gtf(gtf)
df = df[df.gene.isin(df_gtf.gene_id)]

# Create -log10padj column
df['-log10padj'] = -np.log10(df['padj'])

# Create hue column for significant points (|log2FC| > 1 & padj < pval_threshold)
df.loc[(df['padj'] < pval_threshold) & (df['log2FoldChange'] > 1),
        'sig.'] = 'log2FC > 1 & padj < '+str(pval_threshold)
df.loc[(df['padj'] < pval_threshold) & (df['log2FoldChange'] < -1),
        'sig.'] = 'log2FC < -1 & padj < '+str(pval_threshold)
df['sig.'] = df['sig.'].fillna('n.s.')

# Extract genes that are significant
outfile_up = snakemake.output.up_genes
outfile_down = snakemake.output.down_genes
df_genes = df[~df['sig.'].str.contains('n.s.')]
df_genes = df_genes[['gene','log2FoldChange','padj']]

up = df_genes[df_genes['log2FoldChange']>0].copy()
up.sort_values('log2FoldChange',inplace=True,ascending=False)
down = df_genes[df_genes['log2FoldChange']<0].copy()
down.sort_values('log2FoldChange',inplace=True,ascending=False)
up.to_csv(outfile_up, sep='\t', index=False)
down.to_csv(outfile_down, sep='\t', index=False)

# Create volcano function
def volcano(df, x_col, y_col, hue_col, xlabel, ylabel, title, color_dict, path,
            pval_threshold, **kwargs):
    """
    Creates a volcano plot (using a x, y and hue column).
    """
    #sns.set_theme()
    plt.figure(figsize=(5.5,4))
    plt.rcParams['svg.fonttype'] = 'none'

    plt.suptitle(title, fontsize=16)
    sns.scatterplot(data=df, x=x_col, y=y_col, hue=hue_col,
                    palette=color_dict, edgecolor='face',
                    s=25, **kwargs)

    # Add threshold lines: padj cutoff (horizontal) and log2FC = ±1 (vertical)
    plt.axhline(y=-np.log10(pval_threshold), color='black', ls='--', lw=0.5)
    plt.axvline(x=np.log2(2), color='black', ls='--', lw=0.5)
    plt.axvline(x=np.log2(0.5), color='black', ls='--', lw=0.5)

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    leg = plt.legend(fontsize="medium",bbox_to_anchor=(1.6, 0.4), loc='lower right', ncol=1,
        facecolor='white',framealpha=1)
    leg.get_frame().set_linewidth(0)
    plt.savefig(path, bbox_inches='tight', dpi=600)

    return

# Create volcano
volcano(df, 'log2FoldChange', '-log10padj', 'sig.',
        'log2(Fold Change)', '-log10(FDR-adjusted p-value)',
        f'{comp1} vs {comp2} using DESeq2',
        colors, snakemake.output.volcano, pval_threshold)

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/kristinassong/RNAseq
Name: rnaseq
Version: 2
Downloaded: 0
Copyright: Public Domain
License: None
