Snakemake workflow to analyse hematological malignancies in whole genome data

public 1yr ago Version: 1.0.1 0 bookmarks

View Workflow

Snakemake workflow to analyse hematological malignancies in whole genome data

:speech_balloon: Introduction

This snakemake workflow uses modules from hydragenetics to process .fastq files and obtain different kind of variant

Code Snippets

shell:
    "(cnvkit.py call {input.segment} "
    "-v {input.vcf} "
    "-o {output.segment}) &> {log}"

SnakeMake From line 31 of rules/cnvkit.smk

shell:
    """
    if [ -z {params.gene} ]
    then
    cnvkit.py scatter -s {input.segments} {input.segment_regions}  -v {input.vcf}  -o {output.plot} {params.extra}
    else
    cnvkit.py scatter -s {input.segments} {input.segment_regions}  -v {input.vcf}  -o {output.plot} -g {params.gene} {params.extra}
    fi
    """

SnakeMake From line 65 of rules/cnvkit.smk

script:
    "../scripts/cnvkit_to_table.py"

SnakeMake From line 45 of rules/cnvkit_table.smk

shell:
    """
    samtools view -F 1024 -c -e \
    '(rnext == "4" && pnext > 190173000 && pnext < 190176000) || ([SA] =~ "4,19017[345][0-9]{{3}}")' \
    {input.bam} {params.region} > {output.cnt}
    """

SnakeMake SAMtools From line 33 of rules/dux4_rearrangements.smk

shell:
    """
    samtools view -F 1024 -c -e \
    '(rnext == "4" && pnext > 190173000 && pnext < 190176000) || ([SA] =~ "4,19017[345][0-9]{{3}}")' \
    {input.bam} {params.region} > {output.cnt}
    """

SnakeMake SAMtools From line 67 of rules/dux4_rearrangements.smk

script:
    "../scripts/fix_af.py"

SnakeMake From line 24 of rules/fix_af.smk

shell:
    "(gatk --java-options '-Xmx32g' CollectAllelicCounts "
    "-L {input.interval} "
    "-I {input.bam} "
    "-R {input.ref} "
    "-O {output}"
    "{params.extra}) &> {log}"

SnakeMake From line 37 of rules/gatk_cnv_denoise_read_counts_by_sex.smk

shell:
    """
    if [ $(awk -F "," '(NR>1) {{print $7}}' {input.sex}) == "male" ]
    then 
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_m} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    elif [ $(awk -F "," '(NR>1) {{print $7}}' {input.sex}) == "female" ]
    then
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_f} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    else
    if [ $(awk -F "," '(NR>1) {{print $2}}' {input.sex}) == "male" ]
    then 
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_m} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    else 
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_f} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    fi
    fi
    """

SnakeMake From line 77 of rules/gatk_cnv_denoise_read_counts_by_sex.smk

shell:
    "(gatk --java-options '-Xmx16g' ModelSegments "
    "--denoised-copy-ratios {input.denoisedCopyRatio} "
    "--allelic-counts {input.allelicCounts} "
    "--output {params.outdir} "
    "--output-prefix {params.outprefix}"
    "{params.extra}) &> {log}"

SnakeMake From line 153 of rules/gatk_cnv_denoise_read_counts_by_sex.smk

script:
    "../scripts/merge_cnv_json.py"

SnakeMake From line 43 of rules/html_output.smk

script:
    "../scripts/manta_to_tsv.py"

SnakeMake From line 30 of rules/manta_to_tsv.smk

script:
    "../scripts/manta_to_tsv.py"

SnakeMake From line 57 of rules/manta_to_tsv.smk

script:
    "../scripts/manta_to_tsv.indeldup.py"

SnakeMake From line 86 of rules/manta_to_tsv.smk

script:
    "../scripts/mutectcaller_to_tsv.py"

SnakeMake From line 33 of rules/mutectcaller_to_tsv.smk

script:
    "../scripts/mutectcaller_to_tsv.py"

SnakeMake From line 63 of rules/mutectcaller_to_tsv.smk

script:
    "../scripts/peddy_create_ped.py"

SnakeMake From line 30 of rules/peddy_create_ped.smk

shell:
    """
    cat {input.sex} | awk 'NR==1; !/^sample_id/' > {output.peddy_sex_check}
    cat {input.fam} > {output.ped}
    """

SnakeMake From line 63 of rules/peddy_create_ped.smk

script:
    "../scripts/create_peddy_mqc_config.py"

SnakeMake From line 94 of rules/peddy_create_ped.smk

script:
    "../scripts/sample_order_multiqc.py"

SnakeMake From line 36 of rules/sample_order_multiqc.smk

import sys
import xlsxwriter


def float_or_na(value):
    return float(value) if value != '' else None


# import bedfile for CNA genes from bedfile.
bedtable = []
bed_header = ['chr', 'start', 'end', 'annotation']
with open(snakemake.input.gene_interest) as bedfile:
    for line in bedfile:
        bedtable.append(line.strip().split('\t'))

# Import genomic pos to cytoCoord translation to list
chr_bands = []
with open(snakemake.input.cyto, 'r') as chr_band_file:
    for line in chr_band_file:
        chr_bands.append(line.split("\t"))

chromosomes = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY']
relevant_cnvs = {i: [] for i in chromosomes}
relevant_cnvs_header = [
    'ChromosomeBands',
    'Chromosome',
    'Start',
    'End',
    'Log2',
    'BAF',
    "CopyNumber",
    'CopyAllel1',
    'CopyAllel2',
    'Depth',
    'Probes',
    'Weight'
]
# import pdb; pdb.set_trace()
with open(snakemake.input.cns, 'r+') as cnsfile:
    cns_header = next(cnsfile).rstrip().split("\t")
    for cnv_line in cnsfile:
        cnv = cnv_line.strip().split("\t")
        if not (((int(snakemake.params.ploidy) % 2) == 0 and int(snakemake.params.ploidy) == int(cnv[cns_header.index('cn')])
                 and cnv[cns_header.index('cn1')] == cnv[cns_header.index('cn2')])
                or ((int(snakemake.params.ploidy) % 2) != 0 and int(snakemake.params.ploidy) == int(cnv[cns_header.index('cn')])
                    and cnv[cns_header.index('cn1')]+1 == cnv[cns_header.index('cn2')])):
            cnv_chr = cnv[cns_header.index('chromosome')]
            cnv_start = int(cnv[cns_header.index('start')])
            cnv_end = int(cnv[cns_header.index('end')])
            cnv_baf = float_or_na(cnv[cns_header.index('baf')])

            # Translate genomic coordinate to cytogen coordinates
            cyto_coord = ['', '']
            for chr_band in chr_bands:
                if chr_band[0] == cnv_chr:
                    if (cnv_start >= int(chr_band[1]) and cnv_start <= int(chr_band[2])):
                        cyto_coord[0] = chr_band[3]
                    if (cnv_end >= int(chr_band[1]) and cnv_end <= int(chr_band[2])):
                        cyto_coord[1] = chr_band[3]
            if cyto_coord[0] == cyto_coord[1]:
                cyto_coordinates = cnv_chr[3:]+cyto_coord[0]
            else:
                cyto_coordinates = cnv_chr[3:]+cyto_coord[0]+'-'+cyto_coord[1]

            if (cnv_end - cnv_start) >= 100000:
                outline = [cyto_coordinates, cnv_chr, cnv_start, cnv_end, float(cnv[cns_header.index('log2')]), cnv_baf,
                           cnv[cns_header.index('cn')], cnv[cns_header.index('cn1')], cnv[cns_header.index('cn2')],
                           cnv[cns_header.index('depth')], cnv[cns_header.index('probes')], cnv[cns_header.index('weight')]]
                relevant_cnvs[cnv_chr].append(outline)
                continue
            else:
                for bedline in bedtable:
                    if (cnv_chr == bedline[bed_header.index('chr')]):
                        bed_start = int(bedline[bed_header.index('start')])
                        bed_end = int(bedline[bed_header.index('end')])
                        if ((cnv_start >= bed_start and cnv_end <= bed_end) or
                           (cnv_start >= bed_start and cnv_start <= bed_end) or
                           (cnv_end >= bed_start and cnv_end <= bed_end) or
                           (cnv_start <= bed_start and cnv_end >= bed_end)):
                            outline = [cyto_coordinates, cnv_chr, cnv_start, cnv_end, float(cnv[cns_header.index('log2')]),
                                       cnv_baf, cnv[cns_header.index('cn')], cnv[cns_header.index('cn1')],
                                       cnv[cns_header.index('cn2')], cnv[cns_header.index('depth')],
                                       cnv[cns_header.index('probes')], cnv[cns_header.index('weight')]]
                            relevant_cnvs[cnv_chr].append(outline)
                            break

''' Creating xlsx file '''
low_log = float(snakemake.params.log.split(',')[0])
high_log = float(snakemake.params.log.split(',')[1])
sample = str(snakemake.input.cns).split("/")[-1].split("_")[0]
workbook = xlsxwriter.Workbook(snakemake.output[0])
heading_format = workbook.add_format({'bold': True, 'font_size': 18})
tablehead_format = workbook.add_format({'bold': True, 'text_wrap': True})
red_format = workbook.add_format({'font_color': 'red', 'italic': True})
worksheet_whole = workbook.add_worksheet("Whole genome")
worksheet_whole.set_column('B:E', 10)
worksheet_whole.write('A1', 'CNVkit calls', heading_format)
worksheet_whole.write('A3', 'Sample: ' + str(sample))
worksheet_whole.write('A4', 'Tumor content: ' + str(snakemake.params.tc))
worksheet_whole.write('A6', 'Calls larger then 100kb or in CNA bedfile included')
worksheet_whole.write('A7', 'CNV calls with log2 values inbetween ' + str(low_log) + ' and ' + str(high_log) +
                      ' are to be considered uncertain.', red_format)
image_path = snakemake.params.cnvkit_scatter_folder + '/' + sample + '_T'+'.png'
worksheet_whole.insert_image('A9', image_path)
worksheet_whole.write('A31', 'CNA bedfile: '+snakemake.input.gene_interest)
# create sheets for each chromosome
for chromosome in chromosomes:
    worksheet = workbook.add_worksheet(chromosome)
    worksheet.set_column('C:F', 10)
    worksheet.write('A1', 'CNVkit calls for '+chromosome, heading_format)
    worksheet.write('A3', 'Sample: '+str(sample))
    worksheet.write('A5', 'Calls larger then 100kb or in CNA bedfile included')
    worksheet.write('A6', 'CNV calls with log2 values inbetween ' + str(low_log) + ' and ' + str(high_log) +
                    ' are to be considered uncertain.', red_format)
    image_path = snakemake.params.cnvkit_scatter_folder + '/' + sample + '_T_'+chromosome+'.png'
    worksheet.insert_image('A8', image_path)
    worksheet.write_row('A30', relevant_cnvs_header, tablehead_format)
    row = 30
    col = 0
    for line in relevant_cnvs[chromosome]:
        if (low_log < float(line[4]) < high_log):
            worksheet.write_row(row, col, line, red_format)
        else:
            worksheet.write_row(row, col, line)
        row += 1

workbook.close()

Python XlsxWriter From line 3 of scripts/cnvkit_to_table.py

import pandas as pd
import yaml
import numpy as np
import sys


def comment_the_config_keys(config_dict):

    commented_config = '\n'.join(
        ['# ' + line for line in yaml.dump(config_dict).rstrip('\n').split('\n')])

    return commented_config


def get_sex_check_df(sex_check_file_path):
    sex_check_df = pd.read_csv(sex_check_file_path)
    sex_check_df["sex_check_test"] = np.where(~sex_check_df.error,
                                              'Pass', 'Fail')  # report peddy  error check as pass/fail for simplicity
    sex_check_df.rename(columns={'sample_id': 'Sample'}, inplace=True)

    return sex_check_df


def write_peddy_mqc(peddy_df, peddy_config, outfile):

    with open(outfile, 'w') as outfile:
        print(comment_the_config_keys(peddy_config), file=outfile)

        peddy_df.to_csv(outfile, sep='\t', mode='a', index=False)


def main():

    try:

        config = snakemake.config.get("peddy", '').get("config", '')

        with open(config, 'r') as report_configs:
            peddy_mqc_configs = yaml.load(report_configs, Loader=yaml.FullLoader)

            sex_check_df = get_sex_check_df(snakemake.input.peddy_sex_check)
            peddy_sex_config = peddy_mqc_configs.get('peddy_sex_check')
            write_peddy_mqc(sex_check_df, peddy_sex_config, snakemake.output.sex_check_mqc)

    except FileNotFoundError:
        sys.exit('Path to peddy config file not found/specified in config.yaml')


if __name__ == '__main__':
    main()

Python Pandas numpy PyYAML peddy From line 4 of scripts/create_peddy_mqc_config.py

__author__ = "Martin Rippin"
__copyright__ = "Copyright 2021, Martin Rippin"
__email__ = "[email protected]"
__license__ = "GPL-3"


import logging
import pysam
import re
import sys
import gzip


# identify caller software from input path
def getCaller(path: str):
    pathParts = path.split("/")
    if len(pathParts) == 3:
        return pathParts[1]
    else:
        raise ValueError(
            "{} is not a valid input for this script. Required:"
            "'snv_indels/caller/sample_type.vcf'.".format(path)
        )


# modify vcf header if necessary
def modifyHeader(caller: str, header: pysam.libcbcf.VariantHeader):
    if (caller == "pisces" or caller == "gatk_mutect2" or
       caller == "pbrun_deepvariant" or caller == "pbrun_mutectcaller_t"):
        header.info.add("AF", "A", "Float", "DescriptionDescription")
    elif caller == "varscan":
        header.info.add(
            "AF",
            "A",
            "Float",
            (
                "Allel count divided on depth"
                "(Quality of bases: Phred score >= 15)"
            )
        )
    return header


# fix af field in freebayes vcf entries
def fixFreebayes(
        header: pysam.libcbcf.VariantHeader,
        row: pysam.libcbcf.VariantRecord
):
    sample = header.samples[0]
    ads = row.samples[sample].get("AD")
    af = []
    for ad in ads:
        af.append(ad/sum(ads))
    return tuple(af[1:])


# loop through input vcf and write modified entries to new vcf
def writeNewVcf(
        path: str,
        header: pysam.libcbcf.VariantHeader,
        vcf: pysam.libcbcf.VariantFile,
        caller: str
):
    new_vcf = pysam.VariantFile(path, "w", header=header)
    for row in vcf.fetch():
        if caller == "freebayes":
            row.info["AF"] = fixFreebayes(header, row)
        elif caller == "haplotypecaller":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "gatk_mutect2":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "pisces":
            row.info["AF"] = row.samples[0].get("VF")
        elif caller == "vardict":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "pbrun_mutectcaller_t":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "pbrun_deepvariant":
            row.info["AF"] = row.samples[0].get("VAF")
        elif caller == "gatk_select_variants_final":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "varscan":
            row.info["AF"] = row.samples[0].get("AD")/row.samples[0].get("DP")
        else:
            raise ValueError(
                "{} is not a valid caller for this script. Choose between: "
                "freebayes, haplotypecaller, gatk_mutect2, gatk_select_variants_final, "
                "pbrun_deepvariant, pisces, vardict, varscan.".format(caller)
            )
        new_vcf.write(row)
    return


# call function
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, filename=snakemake.log[0])
    logging.info("Read %s", snakemake.input.vcf)
    vcf = pysam.VariantFile(snakemake.input.vcf)
    logging.info("Determine caller...")
    caller = getCaller(snakemake.input.vcf)
    logging.info("Caller is %s", caller)
    logging.info("Add info to header if necessary")
    header = modifyHeader(caller, vcf.header)
    logging.info("Start writing to %s", snakemake.output.vcf)
    writeNewVcf(snakemake.output.vcf, header, vcf, caller)
    logging.info(
        "Successfully written vcf file %s",
        snakemake.output.vcf,
    )

Python pysam FreeBayes VarScan vardict From line 4 of scripts/fix_af.py

import csv
import sys
import vcf

input_file = snakemake.input.vcf
output_file_dels = snakemake.output.dels
output_file_dup = snakemake.output.dups
output_file_ins = snakemake.output.ins


datei = vcf.Reader(open(input_file, "r"))

with open(output_file_dels, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION1", "POSITION2", "LENGTH", "MANTAID", "GENES", "ANNOTATIONINFO", "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in datei:
        if "MantaDEL" in row.ID and not any(xxx in ["MinQUAL", "MinGQ", "MinSomaticScore",
                                                    "Ploidy", "MaxDepth", "MaxMQ0Frac", "NoPairSupport", "SampleFT", "HomRef"]
                                            for xxx in row.FILTER):
            genes = row.INFO["ANN"][0].split("|")
            dellength = row.INFO["SVLEN"]
            pos2 = row.INFO["END"]
            manta_id = ":".join(row.ID.split(":")[0:2])
            # get frequency of paired and split alternate reads
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            if dellength[0] <= -100:
                tsv_writer.writerow([row.CHROM + ":" + str(row.POS), row.CHROM + ":" + str(pos2),
                                    str(dellength)[1:-1], manta_id, genes[3] + "(" + genes[4] + ")",
                                    row.FILTER, str(pr_formatted), str(sr_formatted)])


datei = vcf.Reader(open(input_file, "r"))

with open(output_file_dup, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION1", "POSITION2", "LENGTH", "MANTAID", "GENES", "HOMLENGTH", "HOMSEQ", "ANNOTATIONINFO",
                         "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in datei:
        if "MantaDUP" in row.ID and not any(xxx in ["MinQUAL", "MinGQ", "MinSomaticScore", "Ploidy",
                                                    "MaxDepth", "MaxMQ0Frac", "NoPairSupport", "SampleFT",
                                                    "HomRef"] for xxx in row.FILTER):
            genes = row.INFO["ANN"][0].split("|")
            dellength = row.INFO["SVLEN"]
            pos2 = row.INFO["END"]
            manta_id = ":".join(row.ID.split(":")[0:3])
            try:
                homlen = row.INFO["HOMLEN"]
                homseq = row.INFO["HOMSEQ"]
            except KeyError:
                homlen = "NA"
                homseq = "NA"
            # get frequency of paired and split alternate reads
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            if dellength[0] >= 100:
                tsv_writer.writerow([row.CHROM + ":" + str(row.POS), row.CHROM + ":" + str(pos2), str(dellength)[1:-1],
                                    manta_id, genes[3] + "(" + genes[4] + ")", str(homlen)[1:-1], str(homseq)[1:-1],
                                    row.FILTER, str(pr_formatted), str(sr_formatted)])


datei = vcf.Reader(open(input_file, "r"))

with open(output_file_ins, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION", "REFERENCE", "ALTERNATIVE", "LENGTH", "MANTAID", "GENES",
                         "HOMLENGTH", "HOMSEQ", "ANNOTATIONINFO", "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in datei:
        if "MantaINS" in row.ID and not any(xxx in ["MinQUAL", "MinGQ", "MinSomaticScore", "Ploidy",
                                                    "MaxDepth", "MaxMQ0Frac", "NoPairSupport", "SampleFT", "HomRef"]
                                            for xxx in row.FILTER):
            try:
                genes = row.INFO["ANN"][0].split("|")
            except KeyError:
                genes = ["NA", "NA", "NA", "NA", "NA"]
            try:
                dellength = row.INFO["SVLEN"]
            except KeyError:
                dellength = "NA"
            try:
                homlen = row.INFO["HOMLEN"]
                homseq = row.INFO["HOMSEQ"]
            except KeyError:
                homlen = "NA"
                homseq = "NA"
            manta_id = ":".join(row.ID.split(":")[0:2])
            # get frequency of paired and split alternate reads
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            if dellength == "NA" or dellength[0] >= 100:
                tsv_writer.writerow([row.CHROM + ":" + str(row.POS), row.REF, str(row.ALT)[1:-1], str(dellength)[1:-1],
                                     manta_id, genes[3] + "(" + genes[4] + ")", str(homlen)[1:-1], str(homseq)[1:-1],
                                     row.FILTER, str(pr_formatted), str(sr_formatted)])

Python PyVCF From line 4 of scripts/manta_to_tsv.indeldup.py

import csv
import sys
import vcf

input_file = snakemake.input.vcf
output_file = snakemake.output.tsv

vcf = vcf.Reader(open(input_file, "r"))

with open(output_file, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION1", "MANTAID", "BREAKEND", "GENES", "DEPTH", "ANNOTATIONINFO", "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in vcf:
        if "MantaBND" in row.ID and not any(x in ["MinQUAL", "MinGQ", "MinSomaticScore",
                                                  "Ploidy", "MaxDepth", "MaxMQ0Frac", "NoPairSupport",
                                                  "SampleFT", "HomRef"] for x in row.FILTER):
            try:
                genes = row.INFO["ANN"][0].split("|")
            except KeyError:
                genes = ["NA", "NA", "NA", "NA", "NA"]
            manta_id = ":".join(row.ID.split(":")[0:2])
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            tsv_writer.writerow([row.CHROM + ":" + str(row.POS), manta_id, str(row.ALT)[1:-1],
                                genes[3] + "(" + genes[4] + ")", row.INFO["BND_DEPTH"],
                                row.FILTER, str(pr_formatted), str(sr_formatted)])

Python PyVCF From line 4 of scripts/manta_to_tsv.py

from collections import defaultdict
from collections.abc import Generator
from dataclasses import dataclass
import json
from pathlib import Path
import pysam
import sys
from typing import Union
import cyvcf2


@dataclass
class CNV:
    caller: str
    chromosome: str
    genes: list
    start: int
    length: int
    type: str
    copy_number: float
    baf: float

    def end(self):
        return self.start + self.length - 1

    def overlaps(self, other):
        return self.chromosome == other.chromosome and (
            # overlaps in the beginning, or self contained in other
            (self.start >= other.start and self.start <= other.end())
            or
            # overlaps at the end, or self contained in other
            (self.end() >= other.start and self.end() <= other.end())
            or
            # other is contained in self
            (other.start >= self.start and other.end() <= self.end())
        )

    def __hash__(self):
        return hash(f"{self.caller}_{self.chromosome}:{self.start}-{self.end()}_{self.copy_number}")


def parse_fai(filename, skip=None):
    with open(filename) as f:
        for line in f:
            chrom, length = line.strip().split()[:2]
            if skip is not None and chrom in skip:
                continue
            yield chrom, int(length)


def parse_annotation_bed(filename, skip=None):
    with open(filename) as f:
        for line in f:
            chrom, start, end, name = line.strip().split()[:4]
            if skip is not None and chrom in skip:
                continue
            yield chrom, int(start), int(end), name


def get_vaf(vcf_filename, skip=None):
    vcf = cyvcf2.VCF(vcf_filename)
    for variant in vcf:
        if skip is not None and variant.CHROM in skip:
            continue
        yield variant.CHROM, variant.POS, variant.INFO.get("AF", None)


def get_cnvs(vcf_filename, skip=None):
    cnvs = defaultdict(lambda: defaultdict(list))
    vcf = pysam.VariantFile(vcf_filename)
    for variant in vcf.fetch():
        if skip is not None and variant.chrom in skip:
            continue
        caller = variant.info.get("CALLER")
        if caller is None:
            raise KeyError("could not find caller information for variant, has the vcf been annotated?")
        genes = variant.info.get("Genes")
        if genes is None:
            continue
        if isinstance(genes, str):
            genes = [genes]
        cnv = CNV(
            caller,
            variant.chrom,
            sorted(genes),
            variant.pos,
            variant.info.get("SVLEN"),
            variant.info.get("SVTYPE"),
            variant.info.get("CORR_CN"),
            variant.info.get("BAF"),
        )
        cnvs[variant.chrom][caller].append(cnv)
    return cnvs


def merge_cnv_dicts(dicts, vaf, annotations, chromosomes, filtered_cnvs, unfiltered_cnvs):
    callers = list(map(lambda x: x["caller"], dicts))
    caller_labels = dict(
        cnvkit="cnvkit",
        gatk="GATK",
    )
    cnvs = {}
    for chrom, chrom_length in chromosomes:
        cnvs[chrom] = dict(
            chromosome=chrom,
            label=chrom,
            length=chrom_length,
            vaf=[],
            annotations=[],
            callers={c: dict(name=c, label=caller_labels.get(c, c), ratios=[], segments=[], cnvs=[]) for c in callers},
        )

    for a in annotations:
        for item in a:
            cnvs[item[0]]["annotations"].append(
                dict(
                    start=item[1],
                    end=item[2],
                    name=item[3],
                )
            )

    if vaf is not None:
        for v in vaf:
            cnvs[v[0]]["vaf"].append(
                dict(
                    pos=v[1],
                    vaf=v[2],
                )
            )

    # Iterate over the unfiltered CNVs and pair them according to overlap.
    for uf_cnvs, f_cnvs in zip(unfiltered_cnvs, filtered_cnvs):
        for chrom, cnvdict in uf_cnvs.items():
            callers = sorted(list(cnvdict.keys()))
            first_caller = callers[0]
            rest_callers = callers[1:]

            # Keep track of added CNVs on a chromosome to avoid duplicates
            added_cnvs = set()

            for cnv1 in cnvdict[first_caller]:
                pass_filter = False

                if cnv1 in f_cnvs[chrom][first_caller]:
                    # The CNV is part of the filtered set, so all overlapping
                    # CNVs should pass the filter.
                    pass_filter = True

                cnv_group = [cnv1]
                for caller2 in rest_callers:
                    for cnv2 in cnvdict[caller2]:
                        if cnv1.overlaps(cnv2):
                            # Add overlapping CNVs from other callers
                            cnv_group.append(cnv2)

                            if cnv2 in f_cnvs[chrom][caller2]:
                                # If the overlapping CNV is part of the filtered
                                # set, the whole group should pass the filter.
                                pass_filter = True

                for c in cnv_group:
                    if c in added_cnvs:
                        continue
                    cnvs[c.chromosome]["callers"][c.caller]["cnvs"].append(
                        dict(
                            genes=c.genes,
                            start=c.start,
                            length=c.length,
                            type=c.type,
                            cn=c.copy_number,
                            baf=c.baf,
                            passed_filter=pass_filter,
                        )
                    )
                    added_cnvs.add(c)

    for d in dicts:
        for r in d["ratios"]:
            cnvs[r["chromosome"]]["callers"][d["caller"]]["ratios"].append(
                dict(
                    start=r["start"],
                    end=r["end"],
                    log2=r["log2"],
                )
            )
        for s in d["segments"]:
            cnvs[s["chromosome"]]["callers"][d["caller"]]["segments"].append(
                dict(
                    start=s["start"],
                    end=s["end"],
                    log2=s["log2"],
                )
            )

    for v in cnvs.values():
        v["callers"] = list(v["callers"].values())

    return list(cnvs.values())


def main():
    log = Path(snakemake.log[0])

    logfile = open(log, "w")
    sys.stdout = sys.stderr = logfile

    annotation_beds = snakemake.input["annotation_bed"]
    fasta_index_file = snakemake.input["fai"]
    germline_vcf = snakemake.input["germline_vcf"]
    json_files = snakemake.input["json"]
    filtered_cnv_vcf_files = snakemake.input["filtered_cnv_vcfs"]
    cnv_vcf_files = snakemake.input["cnv_vcfs"]

    if len(germline_vcf) == 0:
        germline_vcf = None

    output_file = snakemake.output["json"]

    skip_chromosomes = snakemake.params["skip_chromosomes"]

    cnv_dicts = []
    for fname in json_files:
        with open(fname) as f:
            cnv_dicts.append(json.load(f))

    fai = parse_fai(fasta_index_file, skip_chromosomes)
    vaf = None
    if germline_vcf is not None:
        vaf = get_vaf(germline_vcf, skip_chromosomes)
    annotations = []
    for filename in annotation_beds:
        annotations.append(parse_annotation_bed(filename, skip_chromosomes))

    filtered_cnv_vcfs = []
    unfiltered_cnv_vcfs = []
    for f_vcf, uf_vcf in zip(filtered_cnv_vcf_files, cnv_vcf_files):
        filtered_cnv_vcfs.append(get_cnvs(f_vcf, skip_chromosomes))
        unfiltered_cnv_vcfs.append(get_cnvs(uf_vcf, skip_chromosomes))

    cnvs = merge_cnv_dicts(cnv_dicts, vaf, annotations, fai, filtered_cnv_vcfs, unfiltered_cnv_vcfs)

    with open(output_file, "w") as f:
        print(json.dumps(cnvs), file=f)


if __name__ == "__main__":
    main()

Python JSON pysam cyvcf2 CNVkit From line 1 of scripts/merge_cnv_json.py

import csv
import sys
import vcf

input_file = snakemake.input.vcf
analysis = snakemake.params.analysis
sample = snakemake.params.sample
output_file = snakemake.output.tsv

vcf = vcf.Reader(open(input_file, "r"))

with open(output_file, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#SAMPLE", "CHROMOSOME", "POSITION", "REFERENCE", "ALTERNATIVE", "ALLELEFREQUENCY", "DEPTH", "GENE",
                         "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", "PROTEINSEQ", "CONSEQUENCE"])
    for row in vcf:
        genes = row.INFO["CSQ"][0].split("|")
        transcript_field = genes[10].split(":")
        if len(transcript_field) == 1:
            transcript_field = ["", ""]
        protein_field = genes[11].split(":")
        if len(protein_field) == 1:
            protein_field = [genes[24], ""]
        i = 0
        if analysis == "tn":
            i = 1
        af = row.samples[i]["AF"]
        tsv_writer.writerow([sample, row.CHROM, row.POS, row.REF, row.ALT, af, row.INFO["DP"], genes[3], transcript_field[0],
                             transcript_field[1], protein_field[0], protein_field[1], genes[1]])

Python PyVCF From line 4 of scripts/mutectcaller_to_tsv.py

import sys


input_file = snakemake.input[0]


with open(input_file, 'r') as samplesheet:
    next(samplesheet)
    for lline in samplesheet:
        line = lline.strip().split("\t")
        if line[3] == "M":
            sex = "1"
        elif line[3] == "K":
            sex = "2"
        else:
            sex = "0"
        with open("qc/peddy/" + line[0] + ".peddy.fam", 'w+') as pedfile:
            pedfile.write("\t".join([line[0], line[0] + "_T", "0", "0", sex, "-9"])+"\n")

Python From line 5 of scripts/peddy_create_ped.py

import sys
import csv


header_row = "Sample_ID,Sample_Name,Description,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project\n"
header_row_split = header_row.strip().split(",")

samples = {}
header = False
with open(snakemake.input.sample_sheet, 'r') as samplesheet:
    for lline in samplesheet:
        if len(lline.split()) == 0:  # skip blank lines
            continue
        if header:
            line = lline.strip().split(",")
            if line[7] == "WGSWP2":
                samples[line[0]] = {
                                        header_row_split[1]: line[1],
                                        "Type": line[2].split("_")[0],
                                        "Sex": line[2].split("_")[1],
                                        "Pedegree_id": line[2].split("_")[2],
                                        "Seq_run": line[2].split("_")[3],
                                        "Project": line[2].split("_")[4],
                                        header_row_split[3]: line[3],
                                        header_row_split[4]: line[4],
                                        header_row_split[5]: line[5],
                                        header_row_split[6]: line[6],
                                        header_row_split[7]: line[7],
                                    }
        if lline == header_row:
            header = True


if len(samples) == 0:
    raise Exception("No samples found, has the header in SampleSheet changed?")

# replacement files for rna and dna samples seperate to get sample order correct
with open(snakemake.output.replacement_dna, "w+") as replacement_tsv_dna, \
     open(snakemake.output.replacement_rna, "w+") as replacement_tsv_rna, \
     open(snakemake.output.order_dna, "w+") as order_tsv_dna, \
     open(snakemake.output.order_rna, "w+") as order_tsv_rna, \
     open(snakemake.output.dnanumber, "w+") as dna_table, \
     open(snakemake.output.rnanumber, "w+") as rna_table:
    order_tsv_dna.write("\t".join(["Sample Order", "Pedegree ID", "DNA number"])+"\n")
    order_tsv_rna.write("\t".join(["Sample Order", "Pedegree ID", "RNA number"])+"\n")
    dna_table.write("\t".join(["Sample", "ped_id", "dna_number"])+"\n")
    rna_table.write("\t".join(["Sample", "ped_id", "rna_number"])+"\n")
    i = 1
    j = 1
    for sample in samples.values():
        sample["Type"] = "R" if sample["Type"] == "Heltranskriptom" else sample["Type"]
        replacement_line = [sample["Pedegree_id"] + "_" + sample["Type"]]
        order_line = [sample["Pedegree_id"] + "_" + sample["Type"], sample["Sample_Name"]]
        if sample["Type"] == "T" or sample["Type"] == "N":
            replacement_tsv_dna.write("\t".join(replacement_line + ["sample_"+str(f"{i:03}")]) + "\n")
            order_tsv_dna.write("\t".join(["sample_"+str(f"{i:03}")] + order_line) + "\n")
            dna_table.write("\t".join(["sample_"+str(f"{i:03}")] + order_line)+"\n")
            i += 1
        elif sample["Type"] == "R":
            replacement_tsv_rna.write("\t".join(replacement_line + ["sample_"+str(f"{j:03}")]) + "\n")
            order_tsv_rna.write("\t".join(["sample_"+str(f"{j:03}")] + order_line) + "\n")
            rna_table.write("\t".join(["sample_"+str(f"{j:03}")] + order_line) + "\n")
            j += 1