Hypermutated Tumor Mutational Signature Detection and Visualization Pipeline

public public 1yr ago 0 bookmarks

This pipeline is designed to detect and visualise Mutational Signatures in Hypermutated Tumors

Designed and Edited by : Mohamed Elsayed Youssef

Supervised by : Dr. Philipp Euskirchen

HOW TO USE:

  1. Download and install the Conda (or Miniconda) package manager.

  2. Use the config files as a parameter for conda to create the work environments:

    • conda env create -f environment.yml

    Note: this does not work on login nodes; use the “run” command first (e.g. srun, as shown below) to get onto a computing node.

    • source activate env_name
  3. use

If you are working on the cluster, make sure you are connected to a computing node first: srun --time 4-00 --mem=8G --ntasks=8 --pty bash -i

Code Snippets

11
12
13
14
15
shell:
   """
   snpEff -Xmx8g -v hg19kg {input} | 
   SnpSift filter "!(ALT = '.') & isVariant(GEN[0])" /dev/stdin > {output}
   """
12
13
14
15
shell:
   "snpEff -Xmx8g -v hg19kg {input} | "
   "SnpSift annotate -exists -v gnomad resources/gnomad.exomes.r2.0.2.sites.vcf.gz /dev/stdin | "
   "SnpSift filter -n ' (exists ID) & !( ID = 'gnomad' )' /dev/stdin > {output}"
9
shell:"SnpSift filter '(GEN[0].VF > 0.05)'  {input} > {output}"
12
13
script:
    'workflow/scripts/pdf_merge.py'
7
8
shell:
    "python workflow/scripts/plot_vf_distribution.py {input.vcf} {output.png}"
6
7
script:
    '/fast/projects/hyperpanel/work/Hyperpanel/workflow/scripts/png_to_pdf_script.py'
6
7
script:
    '/fast/projects/hyperpanel/work/Hyperpanel/workflow/scripts/png_to_pdf_script.py'
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import vcf
# Annotate every record of 0103T.vcf with an alternate allele fraction
# (INFO/AAF = AO / DP) and write all records to 0103T_AAF_ann.vcf.
# The calculated values need to be adjusted depending on the Format of the vcf File

# Read the VCF file
vcf_reader = vcf.Reader(filename="0103T.vcf")

# The context manager guarantees the output file is flushed and closed even
# if reading or writing a record raises part-way through the input.
with open("0103T_AAF_ann.vcf", "w") as annotated_file:
    # Create a new VCF writer that reuses the reader's header as template
    vcf_writer = vcf.Writer(annotated_file, vcf_reader)

    # Loop through the variants
    for variant in vcf_reader:
        depth = variant.INFO.get("DP")
        alt_obs = variant.INFO.get("AO")

        # Annotate only when both values are present and the depth is
        # non-zero; a zero depth would otherwise raise ZeroDivisionError.
        if depth is not None and alt_obs is not None and int(depth) > 0:
            depth = int(depth)
            if isinstance(alt_obs, (list, tuple)):
                # NOTE(review): AO may be parsed as one count per ALT allele,
                # depending on how the header declares it -- confirm against
                # the caller's VCF. Missing per-allele counts stay missing.
                variant.INFO["AAF"] = [
                    None if count is None else int(count) / depth
                    for count in alt_obs
                ]
            else:
                variant.INFO["AAF"] = int(alt_obs) / depth

        # Write the variant to the output file, annotated or not
        vcf_writer.write_record(variant)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import vcf

# Keep only the records of 0103T.vcf whose allele fraction (AD / DP, both
# read from INFO) is greater than 0.05 and write them to
# 0103T_pyvcf_filtered.vcf.
# The calculated values need to be adjusted depending on the Format of the vcf File
vcf_reader = vcf.Reader(filename="0103T.vcf")

# Open a new vcf file to write the filtered variants; the `with` block closes
# the file, so no separate writer close is needed afterwards.
with open("0103T_pyvcf_filtered.vcf", "w") as filtered_file:
    # Create a vcf writer that reuses the reader's header as template
    vcf_writer = vcf.Writer(filtered_file, vcf_reader)

    # Loop through the variants
    for record in vcf_reader:
        # Get the DP and AD values
        depth = record.INFO.get("DP")
        allele_depth = record.INFO.get("AD")

        # Skip records without AD, without DP, or with a zero DP (which
        # would otherwise raise ZeroDivisionError).
        # NOTE(review): assumes AD and DP are parsed as scalars -- confirm
        # against the header of the input VCF.
        if allele_depth is None or not depth:
            continue

        # Write the variant only if its AAF is greater than 0.05
        if allele_depth / depth > 0.05:
            vcf_writer.write_record(record)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import vcf
import matplotlib.pyplot as plt

# Plot a histogram of the allele fractions (AO / DP, both read from INFO) of
# all records in input.vcf and save it as aaf_distribution.png.
# The calculated values need to be adjusted depending on the Format of the vcf File
vcf_reader = vcf.Reader(filename="input.vcf")

# List to store the AAF values
aaf_values = []

# Loop through the variants
for record in vcf_reader:
    depth = record.INFO.get("DP")
    alt_obs = record.INFO.get("AO")

    # Skip records without AO, without DP, or with a zero DP (which would
    # otherwise raise ZeroDivisionError).
    if alt_obs is None or not depth:
        continue

    if isinstance(alt_obs, (list, tuple)):
        # NOTE(review): AO may be parsed as one count per ALT allele,
        # depending on how the header declares it -- confirm against the
        # input VCF. Each available allele contributes one value.
        aaf_values.extend(
            count / depth for count in alt_obs if count is not None
        )
    else:
        aaf_values.append(alt_obs / depth)

# Plotting the AAF distribution
plt.hist(aaf_values, bins=50, edgecolor="black")

# Adding labels and title to the plot
plt.xlabel("Allele Frequency")
plt.ylabel("Count")
plt.title("Allele Frequency Distribution")

# Saving the plot as a PNG image
plt.savefig("aaf_distribution.png")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from PyPDF2 import PdfFileReader, PdfFileWriter
import os

def merge_pdfs(paths, output_filename):
    """Concatenate the pages of several PDF files into one output PDF.

    Args:
        paths: Iterable of input PDF paths; pages are appended in this order.
        output_filename: Path of the PDF file to write (opened in binary
            write mode).
    """
    merged = PdfFileWriter()

    for source_path in paths:
        source = PdfFileReader(source_path)
        # Pages are fetched lazily, one at a time, in document order.
        pages = (source.getPage(index) for index in range(source.getNumPages()))
        for page in pages:
            merged.addPage(page)

    with open(output_filename, 'wb') as sink:
        merged.write(sink)

if __name__ == '__main__':
    # Collect the seven inputs from the snakemake object in final page order.
    ordered_inputs = [
        snakemake.input.file1,
        snakemake.input.file2,
        snakemake.input.file3,
        snakemake.input.file4,
        snakemake.input.file5,
        snakemake.input.file6,
        snakemake.input.file7,
    ]

    # Merge everything in one pass. This yields the same page order as
    # merging in three groups first, but avoids fixed-name temporary files in
    # the working directory, which would collide between concurrently running
    # jobs and be left behind if a merge step failed.
    merge_pdfs(ordered_inputs, snakemake.output.merged)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import vcf
import matplotlib.pyplot as plt
import argparse

def plot_af_distribution(input_vcf, output_png):
    """Plot a histogram of the first sample's AF values and save it as PNG.

    Args:
        input_vcf: Path of the VCF file to read.
        output_png: Path of the image file the histogram is saved to.
    """
    af_values = []

    for record in vcf.Reader(filename=input_vcf):
        call_data = record.samples[0].data

        # Records whose first sample carries no AF field are ignored.
        if "AF" not in call_data._fields:
            continue

        af = call_data.AF
        # Only plain numbers are collected; missing values and any other
        # type (such as lists) are skipped.
        if af is None or not isinstance(af, (float, int)):
            continue
        af_values.append(af)

    # Draw the histogram and decorate it
    plt.hist(af_values, bins=50, edgecolor="black")
    plt.xlabel("Allele Frequency (AF)")
    plt.ylabel("Count")
    plt.title("Allele Frequency (AF) Distribution")

    # Write the figure to disk
    plt.savefig(output_png)

if __name__ == "__main__":
    # Command-line entry point: two positional arguments, input then output.
    cli = argparse.ArgumentParser(description="Plot AF distribution from VCF")
    positional_args = (
        ("input_vcf", "Input VCF file"),
        ("output_png", "Output PNG file for the plot"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)
    parsed = cli.parse_args()

    plot_af_distribution(parsed.input_vcf, parsed.output_png)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import vcf
import matplotlib.pyplot as plt
import argparse

def plot_vf_distribution(input_vcf, output_png):
    """Plot a histogram of the first sample's VF values and save it as PNG.

    Args:
        input_vcf: Path of the VCF file to read.
        output_png: Path of the image file the histogram is saved to.
    """
    vf_values = []

    for record in vcf.Reader(filename=input_vcf):
        call_data = record.samples[0].data

        # Records whose first sample carries no VF field are ignored.
        if "VF" not in call_data._fields:
            continue

        vf = call_data.VF
        # Only plain numbers are collected; missing values and any other
        # type (such as lists) are skipped.
        if vf is None or not isinstance(vf, (float, int)):
            continue
        vf_values.append(vf)

    # Draw the histogram and decorate it
    plt.hist(vf_values, bins=50, edgecolor="black")
    plt.xlabel("Variant Frequency (VF)")
    plt.ylabel("Count")
    plt.title("Variant Frequency (VF) Distribution")

    # Write the figure to disk
    plt.savefig(output_png)

if __name__ == "__main__":
    # Command-line entry point: two positional arguments, input then output.
    cli = argparse.ArgumentParser(description="Plot VF distribution from VCF")
    positional_args = (
        ("input_vcf", "Input VCF file"),
        ("output_png", "Output PNG file for the plot"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)
    parsed = cli.parse_args()

    plot_vf_distribution(parsed.input_vcf, parsed.output_png)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
from PIL import Image

def png_to_pdf(png_path, output_pdf_path):
    """Convert an image file to a PDF file.

    Args:
        png_path: Path of the image to read.
        output_pdf_path: Path the converted image is saved to; the output
            format is chosen by Pillow from this path's extension.
    """
    # The context manager releases the underlying file handle once the
    # conversion is done, instead of leaving it open until garbage collection.
    with Image.open(png_path) as image:
        # Convert to 'RGB' before saving as PDF to avoid potential errors
        img_rgb = image.convert('RGB')

        # Save as a PDF
        img_rgb.save(output_pdf_path)

if __name__ == '__main__':
    # Input and output paths are taken from the snakemake object.
    source_png = snakemake.input.png
    target_pdf = snakemake.output.pdf
    png_to_pdf(source_png, target_pdf)
ShowHide 9 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/mohamed6777/hyperpanel
Name: hyperpanel
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...