Hypermutated Tumor Mutational Signature Detection and Visualization Pipeline

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation

You can help improve this workflow by suggesting the addition or removal of keywords, suggesting changes, reporting issues, or requesting to become a maintainer of the workflow.

This pipeline is designed to detect and visualise Mutational Signatures in Hypermutated Tumors

Designed and Edited by : Mohamed Elsayed Youssef

Supervised by : Dr. Philipp Euskirchen

HOW TO USE:

Download and install the Conda (or Miniconda) package manager.
Use the config files as a parameter for conda to create the work environments >
- conda env create -f environment.yml
does not work on Login Nodes > use “run” command
- source activate env_name
use

If you are working on the cluster, make sure you are connected to a compute node first: srun --time 4-00 --mem=8G --ntasks=8 --pty bash -i

Code Snippets

shell:
   """
   snpEff -Xmx8g -v hg19kg {input} | 
   SnpSift filter "!(ALT = '.') & isVariant(GEN[0])" /dev/stdin > {output}
   """

SnakeMake snpEff SnpSift From line 11 of rules/filter1.smk

shell:
   "snpEff -Xmx8g -v hg19kg {input} | "
   "SnpSift annotate -exists -v gnomad resources/gnomad.exomes.r2.0.2.sites.vcf.gz /dev/stdin | "
   "SnpSift filter -n ' (exists ID) & !( ID = 'gnomad' )' /dev/stdin > {output}"

SnakeMake snpEff SnpSift From line 12 of rules/filter2.smk

shell:"SnpSift filter '(GEN[0].VF > 0.05)'  {input} > {output}"

SnakeMake SnpSift From line 9 of rules/filter3.smk

script:
    'workflow/scripts/pdf_merge.py'

SnakeMake From line 12 of rules/merge_pdfs.smk

shell:
    "python workflow/scripts/plot_vf_distribution.py {input.vcf} {output.png}"

SnakeMake From line 7 of rules/plot_vf.smk

script:
    '/fast/projects/hyperpanel/work/Hyperpanel/workflow/scripts/png_to_pdf_script.py'

SnakeMake From line 6 of rules/png_to_pdf.smk

script:
    '/fast/projects/hyperpanel/work/Hyperpanel/workflow/scripts/png_to_pdf_script.py'

SnakeMake From line 6 of rules/png_to_pdf.smk

import vcf
# Annotate every variant of 0103T.vcf with an alternate allele frequency
# (AAF = AO / DP) and write the result to 0103T_AAF_ann.vcf.
# NOTE: the INFO keys used below (DP, AO) need to be adjusted depending on
# the format of the VCF file being processed.
# NOTE(review): the AAF key is not declared in the output header; some
# downstream tools may require an ##INFO line for it - confirm.
vcf_reader = vcf.Reader(filename="0103T.vcf")

# The `with` block guarantees the output file is closed even if a record
# fails to parse or write.
with open("0103T_AAF_ann.vcf", "w") as annotated_file:
    vcf_writer = vcf.Writer(annotated_file, vcf_reader)

    for variant in vcf_reader:
        depth = variant.INFO.get("DP")
        alt_observations = variant.INFO.get("AO")

        # Only annotate when both values are present and the depth is
        # non-zero; a zero depth would otherwise raise ZeroDivisionError.
        if (
            depth is not None
            and alt_observations is not None
            and int(depth) != 0
        ):
            depth = int(depth)
            if isinstance(alt_observations, (list, tuple)):
                # PyVCF yields a list for per-allele fields (Number=A), so
                # compute one frequency per ALT allele; missing entries
                # stay missing.
                variant.INFO["AAF"] = [
                    None if count is None else int(count) / depth
                    for count in alt_observations
                ]
            else:
                variant.INFO["AAF"] = int(alt_observations) / depth

        # Every variant is written, annotated or not.
        vcf_writer.write_record(variant)

Python PyVCF From line 2 of scripts/AAF_annotation.py

import vcf

# Keep only the variants of 0103T.vcf whose allele frequency (AD / DP)
# exceeds 0.05 and write them to 0103T_pyvcf_filtered.vcf.
# NOTE: the INFO keys used below (DP, AD) need to be adjusted depending on
# the format of the VCF file being processed.
vcf_reader = vcf.Reader(filename="0103T.vcf")

# The `with` block closes the output file, so the writer needs no separate
# close call afterwards.
with open("0103T_pyvcf_filtered.vcf", "w") as filtered_file:
    vcf_writer = vcf.Writer(filtered_file, vcf_reader)

    for record in vcf_reader:
        DP = record.INFO.get("DP")
        AD = record.INFO.get("AD")

        # Skip records lacking either value, and records with zero depth:
        # the frequency is undefined there and the division would raise
        # ZeroDivisionError.
        if DP is None or AD is None or DP == 0:
            continue

        # NOTE(review): assumes AD is parsed as a single number; if the
        # header declares it per-allele it will be a list - confirm.
        if AD / DP > 0.05:
            vcf_writer.write_record(record)

Python PyVCF From line 2 of scripts/AAF_filter.py

import vcf
import matplotlib.pyplot as plt

# Plot the distribution of alternate allele frequencies (AO / DP) found in
# input.vcf and save it as aaf_distribution.png.
# NOTE: the INFO keys used below (DP, AO) need to be adjusted depending on
# the format of the VCF file being processed.
vcf_reader = vcf.Reader(filename="input.vcf")

# List to store the AAF values
aaf_values = []

for record in vcf_reader:
    DP = record.INFO.get("DP")
    AO = record.INFO.get("AO")

    # Skip records lacking either value or having zero depth (the division
    # would raise ZeroDivisionError).
    if DP is None or AO is None or DP == 0:
        continue

    # PyVCF yields a list for per-allele fields (Number=A); treat a scalar
    # as a one-element list so both shapes are handled the same way.
    ao_counts = AO if isinstance(AO, (list, tuple)) else [AO]
    aaf_values.extend(
        count / DP for count in ao_counts if count is not None
    )

# Draw on an explicitly created figure so it can be released afterwards.
figure, axes = plt.subplots()
try:
    axes.hist(aaf_values, bins=50, edgecolor="black")

    # Adding labels and title to the plot
    axes.set_xlabel("Allele Frequency")
    axes.set_ylabel("Count")
    axes.set_title("Allele Frequency Distribution")

    # Saving the plot as a PNG image
    figure.savefig("aaf_distribution.png")
finally:
    plt.close(figure)

Python matplotlib PyVCF From line 2 of scripts/AAF_plot.py

from PyPDF2 import PdfFileReader, PdfFileWriter
import os

def merge_pdfs(paths, output_filename):
    """Concatenate the pages of several PDF files into one PDF.

    Args:
        paths: Iterable of input PDF paths; pages are appended in this order.
        output_filename: Path the combined PDF is written to.
    """
    combined = PdfFileWriter()

    for source_path in paths:
        source = PdfFileReader(source_path)
        # Copy each page of the current input, preserving page order.
        for index in range(source.getNumPages()):
            current_page = source.getPage(index)
            combined.addPage(current_page)

    with open(output_filename, 'wb') as destination:
        combined.write(destination)

if __name__ == '__main__':
    # The `snakemake` object is expected to be supplied by Snakemake when
    # this file runs through a rule's `script:` directive.
    #
    # All seven inputs are merged in a single pass, in the order
    # file1..file7. This yields the same page order as merging them in
    # groups first, but avoids fixed-name temporary files in the working
    # directory, which could collide between parallel jobs and were left
    # behind whenever a merge failed.
    ordered_inputs = [
        snakemake.input.file1,
        snakemake.input.file2,
        snakemake.input.file3,
        snakemake.input.file4,
        snakemake.input.file5,
        snakemake.input.file6,
        snakemake.input.file7,
    ]

    merge_pdfs(ordered_inputs, snakemake.output.merged)

Python Snakemake From line 1 of scripts/pdf_merge.py

import vcf
import matplotlib.pyplot as plt
import argparse

def plot_af_distribution(input_vcf, output_png):
    """Plot a histogram of the first sample's AF values from a VCF file.

    Args:
        input_vcf: Path of the VCF file to read.
        output_png: Path the histogram image is saved to.
    """
    vcf_reader = vcf.Reader(filename=input_vcf)

    # Numeric AF values collected from the first sample of every record.
    af_values = []

    for record in vcf_reader:
        # A VCF without sample columns has nothing to read AF from.
        if not record.samples:
            continue

        call_data = record.samples[0].data
        if "AF" not in call_data._fields:
            continue

        af = call_data.AF
        # PyVCF yields a list for per-allele fields (Number=A); treat a
        # scalar as a one-element list so list-valued AF is no longer
        # silently dropped. Non-numeric entries (e.g. None) are skipped.
        candidates = af if isinstance(af, (list, tuple)) else [af]
        af_values.extend(
            value for value in candidates if isinstance(value, (float, int))
        )

    # Use a dedicated figure so repeated calls do not draw on top of each
    # other, and close it so it does not stay alive in pyplot's registry.
    figure, axes = plt.subplots()
    try:
        axes.hist(af_values, bins=50, edgecolor="black")

        # Adding labels and title to the plot
        axes.set_xlabel("Allele Frequency (AF)")
        axes.set_ylabel("Count")
        axes.set_title("Allele Frequency (AF) Distribution")

        figure.savefig(output_png)
    finally:
        plt.close(figure)

if __name__ == "__main__":
    # Command-line entry point: two positional arguments, the VCF to read
    # and the image file to write.
    cli = argparse.ArgumentParser(description="Plot AF distribution from VCF")
    for arg_name, arg_help in (
        ("input_vcf", "Input VCF file"),
        ("output_png", "Output PNG file for the plot"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    plot_af_distribution(options.input_vcf, options.output_png)

Python matplotlib PyVCF From line 2 of scripts/plot_af_distribution.py

import vcf
import matplotlib.pyplot as plt
import argparse

def plot_vf_distribution(input_vcf, output_png):
    """Plot a histogram of the first sample's VF values from a VCF file.

    Args:
        input_vcf: Path of the VCF file to read.
        output_png: Path the histogram image is saved to.
    """
    vcf_reader = vcf.Reader(filename=input_vcf)

    # Numeric VF values collected from the first sample of every record.
    vf_values = []

    for record in vcf_reader:
        # A VCF without sample columns has nothing to read VF from.
        if not record.samples:
            continue

        call_data = record.samples[0].data
        if "VF" not in call_data._fields:
            continue

        vf = call_data.VF
        # PyVCF yields a list for fields not declared with Number=1; treat
        # a scalar as a one-element list so both shapes are plotted.
        # Non-numeric entries (e.g. None) are skipped.
        candidates = vf if isinstance(vf, (list, tuple)) else [vf]
        vf_values.extend(
            value for value in candidates if isinstance(value, (float, int))
        )

    # Use a dedicated figure so repeated calls do not draw on top of each
    # other, and close it so it does not stay alive in pyplot's registry.
    figure, axes = plt.subplots()
    try:
        axes.hist(vf_values, bins=50, edgecolor="black")

        # Adding labels and title to the plot
        axes.set_xlabel("Variant Frequency (VF)")
        axes.set_ylabel("Count")
        axes.set_title("Variant Frequency (VF) Distribution")

        figure.savefig(output_png)
    finally:
        plt.close(figure)

if __name__ == "__main__":
    # Command-line entry point: two positional arguments, the VCF to read
    # and the image file to write.
    cli = argparse.ArgumentParser(description="Plot VF distribution from VCF")
    for arg_name, arg_help in (
        ("input_vcf", "Input VCF file"),
        ("output_png", "Output PNG file for the plot"),
    ):
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    plot_vf_distribution(options.input_vcf, options.output_png)

Python matplotlib PyVCF From line 2 of scripts/plot_vf_distribution.py

from PIL import Image

def png_to_pdf(png_path, output_pdf_path):
    """Convert an image file into a PDF document.

    Args:
        png_path: Path of the image to read.
        output_pdf_path: Path the PDF is written to.
    """
    # The context manager releases the input file handle once the PDF has
    # been written, instead of leaving it open until garbage collection.
    with Image.open(png_path) as image:
        # Convert to 'RGB' before saving as PDF to avoid potential errors
        img_rgb = image.convert('RGB')

        # Name the format explicitly so the result is a PDF regardless of
        # the extension of the output path.
        img_rgb.save(output_pdf_path, 'PDF')

if __name__ == '__main__':
    # The `snakemake` object is expected to be supplied by Snakemake when
    # this file runs through a rule's `script:` directive.
    source_png = snakemake.input.png
    target_pdf = snakemake.output.pdf
    png_to_pdf(source_png, target_pdf)