A snakemake workflow for analysing synonymous and nonsynonymous mutations

public public 1yr ago 0 bookmarks

This simple workflow constructs RSV-A and RSV-B synonymous and nonsynonymous mutation graphs. These graphs are scaled by gene length, tree length and multiplied based on the proportion of loci at which they may occur (3 for synonymous, as they can occur at every third codon, and 3/2 for nonsynonymous).

Running the workflow

This workflow uses Snakemake. To run from the command line, run "snakemake --cores all" from this directory. RSV-A and RSV-B can be specified as "a" or "b" in the Snakefile rule all input.

Workflow inputs

This workflow requires as input in the data folder:

  • a nwk tree file to calculate total tree length

  • a nucleotide mutation file (json format)

  • an amino acid mutation file (json format)

Code Snippets

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
from collections import Counter
import pandas as pd
import json
import math
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import argparse
from Bio import Phylo

def GenesAndCodons(aa_muts, nt_muts):
    codons=[]
    all_genes =[] 
    with open(aa_muts) as a_muts:
        with open(nt_muts) as n_muts:
            dictofgenes=dict()
            aamuts = json.load(a_muts)
            ntmuts = json.load(n_muts)
            genes = ('F','G','M','M2','NS1','NS2','P','SH','N') #genes in RSV

            for gene in genes:
                for key, node in aamuts['annotations'].items():
                    if key == gene:
                        location_of_gene=[]
                        location_of_gene = list(range(node['start'],node['end']+1)) #where each gene starts and ends
                dictofgenes[gene]=location_of_gene

            for k, n in aamuts['nodes'].items():
                for key, node in ntmuts['nodes'].items():
                    if k == key:
                        for y in node['muts']:
                            numbers =[]
                            number =int(y[1:-1])
                            numbers.append(number)
                            for pos in numbers:
                                for gene, location_of_gene in dictofgenes.items():
                                    if pos in location_of_gene:
                                        codon = (math.floor((pos-location_of_gene[0])/3))+1        
                                        all_genes.append(gene)
                                        codons.append(codon)
        df=pd.DataFrame({'Gene':all_genes,'Codon':codons})
        return(df)

def MutationsineachGene(aamutations, ntmutations):
    genes =['F', 'G', 'M', 'M2', 'NS1', 'NS2', 'P', 'SH', 'N']
    muts_in_genes = dict()
    muts_in_genes_correct_index=dict()
    df= GenesAndCodons(aamutations, ntmutations)
    for gene in genes:
        muts_in_genes[gene]= df.loc[df['Gene']==gene]

    for gene, muts in muts_in_genes.items():
        muts = muts.reset_index(drop=True)
        muts_in_genes_correct_index[gene]=muts
    return(muts_in_genes_correct_index)


def AA_Mutations(aamutations, ntmutations):
    aa_m = dict()
    with open(aamutations) as f:
        with open(ntmutations) as g:
            genes = ('F','G','M','M2','NS1','NS2','P','SH','N')
            aamuts = json.load(f)
            for gene in genes:
                mut_list=[]
                for k, n in aamuts['nodes'].items():  
                            for i,j in n['aa_muts'].items():
                                if j!=[] and i ==gene:
                                    mut_list.append(j)
                flatlist =[item for sublist in mut_list for item in sublist]
                flatlist = [int(i[1:-1]) for i in flatlist]
                aa_m[gene]=flatlist
    return(aa_m)


def non_synonymous_or_synonymous(aa_muts, nt_muts):
    aa_mutations = AA_Mutations(aa_muts, nt_muts)
    mutations_in_genes = MutationsineachGene(aa_muts, nt_muts)
    synonymousmutations =[]
    nonsynonymousmutations =[]
    ratios=[]
    sel =[]
    listofgenes =('F','G','M','M2','NS1','NS2','P','SH','N')
    for gene in listofgenes:
        all_nonsyn_muts =[]
        all_syn_muts =[]
        for (gene_,mutation), (gene__,aa_mut) in zip(mutations_in_genes.items(), aa_mutations.items()):
            if gene_ == gene and gene__ == gene:
                a =list(mutation['Codon'])
                all_muts = Counter(a)
                amino_acid_muts = Counter(aa_mut)
                synonymous_muts = all_muts-amino_acid_muts


        for genes, j in amino_acid_muts.items():
            all_nonsyn_muts.append(j)
        nonsynonymousmutations.append(sum(all_nonsyn_muts))
        for k, l in synonymous_muts.items():
            all_syn_muts.append(l)
        synonymousmutations.append(sum(all_syn_muts))
        for a, b in zip(nonsynonymousmutations, synonymousmutations):
            ratio = a/b #ratio of nonsynonymous to synonymous mutations
            if ratio>1:selection =('adaptive')
            elif ratio<1: selection = ('purifying')
            elif ratio ==1: selection =('neutral')
        sel.append(selection)
        ratios.append(ratio)

    df = pd.DataFrame({"gene":listofgenes, "synonymous mutations": synonymousmutations, "nonsynonymous mutations":nonsynonymousmutations, "dN/dS ratio":ratios, "selection":sel })
    return(df)

if __name__=="__main__":
    parser = argparse.ArgumentParser(
        description="analyse synonymous and nonsynonymous",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--aa', required=True, type=str, help="aa json file")
    parser.add_argument('--nt', type=str, help="nt json file")
    parser.add_argument('--output', type=str,  help="output graph png")
    parser.add_argument('--outputnonsyn', type=str, help="output file for nonsynonymous mutations")
    parser.add_argument('--table', type=str,  help="output table csv")
    parser.add_argument('--tree', type=str,  help="input tree nwk")
    args = parser.parse_args()

    """ratio of nonsynonymous mutations in G to nonsynonymous in F is higher than synonymous G to synonymous F"""
    df1 = non_synonymous_or_synonymous(args.aa, args.nt)

    tree_file  = Phylo.read(args.tree, "newick")
    tree_file.root_at_midpoint()
    tree_file.find_clades()
    total_len = tree_file.total_branch_length()

    with open(args.aa) as f:
        gene_length=[]
        dictofgenes=dict()
        aamuts = json.load(f)
        keys = ('F','G','M','M2','NS1','NS2','P','SH','N')

        for gene in keys:
            for key, node in aamuts['annotations'].items():
                if key == gene:
                    loc_list=[]
                    loc_list = list(range(node['start'],node['end']+1))
            dictofgenes[gene]=loc_list
        for gene, loc in dictofgenes.items():
            gene_length.append(len(loc))
        df1['length of gene'] =gene_length
        df1['synonymous mutation/gene'] = ((df1['synonymous mutations']/df1['length of gene'])/total_len)*3   #multiplied by 3 because every 3rd is an option
        df1['nonsynonymous mutation/gene']=((df1['nonsynonymous mutations']/df1['length of gene'])/total_len) *(3/2) #multiplied by 2/3 as 1 and 2 are an option

    plt.figure(figsize=(8,6))
    gene_names = df1['gene'].to_list()
    gene_name = df1['gene']
    colors_ = np.array(["green","blue","yellow","pink","black","orange","gray","cyan","magenta"])
    scatter = plt.scatter(df1['length of gene'], 
                df1['synonymous mutation/gene'],
                s=150, c=colors_)
    for i in range(0, len(df1['length of gene'])):
        plt.text(df1['length of gene'][i] - 10, df1['synonymous mutation/gene'][i], f'{gene_name[i]}')
    plt.xlabel("Gene Length", size=20)
    plt.ylabel("synonymous mutation rate", size=20)
    plt.legend(handles=scatter.legend_elements()[0], 
            labels=gene_names,
            title="gene")
    plt.title("Number of Synonymous Mutations in Each Gene")
    plt.savefig(args.output)

    csv_file = non_synonymous_or_synonymous(args.aa, args.nt)
    csv_file.to_csv(args.table)

    plt.figure(figsize=(8,6))
    scatter_1 = plt.scatter(df1['length of gene'], 
                df1['nonsynonymous mutation/gene'],
                s=150, c=colors_)
    for i in range(0, len(df1['length of gene'])):
        plt.text(df1['length of gene'][i] - 10, df1['nonsynonymous mutation/gene'][i], f'{gene_name[i]}')
    plt.xlabel("Gene Length", size=20)
    plt.ylabel("nonsynonymous mutation rate", size=20)
    plt.legend(handles=scatter_1.legend_elements()[0], 
            labels=gene_names,
            title="gene")
    plt.title("Number of Nonsynonymous Mutations in Each Gene")
    plt.savefig(args.outputnonsyn)
16
17
18
19
20
21
22
23
24
25
shell:
    """
    python3 scripts/graphs.py \
    --aa {input.aa} \
    --nt {input.nt} \
    --output {output.graph} \
    --outputnonsyn {output.graphnonsyn} \
    --table {output.table} \
    --tree {input.tree}
    """
ShowHide 1 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/LauraU123/synonymous_nonysynonymous
Name: synonymous_nonysynonymous
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...