Snakemake workflow to analyse hematological malignancies in whole genome data

public public 1yr ago Version: 1.0.1 0 bookmarks

Snakemake workflow to analyse hematological malignancies in whole genome data

:speech_balloon: Introduction

This snakemake workflow uses modules from hydragenetics to process .fastq files and obtain different kind of variant

Code Snippets

31
32
33
34
shell:
    "(cnvkit.py call {input.segment} "
    "-v {input.vcf} "
    "-o {output.segment}) &> {log}"
65
66
67
68
69
70
71
72
73
shell:
    """
    if [ -z {params.gene} ]
    then
    cnvkit.py scatter -s {input.segments} {input.segment_regions}  -v {input.vcf}  -o {output.plot} {params.extra}
    else
    cnvkit.py scatter -s {input.segments} {input.segment_regions}  -v {input.vcf}  -o {output.plot} -g {params.gene} {params.extra}
    fi
    """
45
46
script:
    "../scripts/cnvkit_to_table.py"
33
34
35
36
37
38
shell:
    """
    samtools view -F 1024 -c -e \
    '(rnext == "4" && pnext > 190173000 && pnext < 190176000) || ([SA] =~ "4,19017[345][0-9]{{3}}")' \
    {input.bam} {params.region} > {output.cnt}
    """
67
68
69
70
71
72
shell:
    """
    samtools view -F 1024 -c -e \
    '(rnext == "4" && pnext > 190173000 && pnext < 190176000) || ([SA] =~ "4,19017[345][0-9]{{3}}")' \
    {input.bam} {params.region} > {output.cnt}
    """
24
25
script:
    "../scripts/fix_af.py"
37
38
39
40
41
42
43
shell:
    "(gatk --java-options '-Xmx32g' CollectAllelicCounts "
    "-L {input.interval} "
    "-I {input.bam} "
    "-R {input.ref} "
    "-O {output}"
    "{params.extra}) &> {log}"
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
shell:
    """
    if [ $(awk -F "," '(NR>1) {{print $7}}' {input.sex}) == "male" ]
    then 
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_m} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    elif [ $(awk -F "," '(NR>1) {{print $7}}' {input.sex}) == "female" ]
    then
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_f} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    else
    if [ $(awk -F "," '(NR>1) {{print $2}}' {input.sex}) == "male" ]
    then 
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_m} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    else 
    (gatk --java-options '-Xmx7g' DenoiseReadCounts \
    -I {input.hdf5Tumor} \
    --count-panel-of-normals {input.hdf5PoN_f} \
    --standardized-copy-ratios {output.stdCopyRatio} \
    --denoised-copy-ratios {output.denoisedCopyRatio} \
    {params.extra}) &> {log}
    fi
    fi
    """
153
154
155
156
157
158
159
shell:
    "(gatk --java-options '-Xmx16g' ModelSegments "
    "--denoised-copy-ratios {input.denoisedCopyRatio} "
    "--allelic-counts {input.allelicCounts} "
    "--output {params.outdir} "
    "--output-prefix {params.outprefix}"
    "{params.extra}) &> {log}"
43
44
script:
    "../scripts/merge_cnv_json.py"
30
31
script:
    "../scripts/manta_to_tsv.py"
57
58
script:
    "../scripts/manta_to_tsv.py"
86
87
script:
    "../scripts/manta_to_tsv.indeldup.py"
33
34
script:
    "../scripts/mutectcaller_to_tsv.py"
63
64
script:
    "../scripts/mutectcaller_to_tsv.py"
30
31
script:
    "../scripts/peddy_create_ped.py"
63
64
65
66
67
shell:
    """
    cat {input.sex} | awk 'NR==1; !/^sample_id/' > {output.peddy_sex_check}
    cat {input.fam} > {output.ped}
    """
94
95
script:
    "../scripts/create_peddy_mqc_config.py"
36
37
script:
    "../scripts/sample_order_multiqc.py"
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import sys
import xlsxwriter


def float_or_na(value):
    return float(value) if value != '' else None


# import bedfile for CNA genes from bedfile.
bedtable = []
bed_header = ['chr', 'start', 'end', 'annotation']
with open(snakemake.input.gene_interest) as bedfile:
    for line in bedfile:
        bedtable.append(line.strip().split('\t'))

# Import genomic pos to cytoCoord translation to list
chr_bands = []
with open(snakemake.input.cyto, 'r') as chr_band_file:
    for line in chr_band_file:
        chr_bands.append(line.split("\t"))

chromosomes = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY']
relevant_cnvs = {i: [] for i in chromosomes}
relevant_cnvs_header = [
    'ChromosomeBands',
    'Chromosome',
    'Start',
    'End',
    'Log2',
    'BAF',
    "CopyNumber",
    'CopyAllel1',
    'CopyAllel2',
    'Depth',
    'Probes',
    'Weight'
]
# import pdb; pdb.set_trace()
with open(snakemake.input.cns, 'r+') as cnsfile:
    cns_header = next(cnsfile).rstrip().split("\t")
    for cnv_line in cnsfile:
        cnv = cnv_line.strip().split("\t")
        if not (((int(snakemake.params.ploidy) % 2) == 0 and int(snakemake.params.ploidy) == int(cnv[cns_header.index('cn')])
                 and cnv[cns_header.index('cn1')] == cnv[cns_header.index('cn2')])
                or ((int(snakemake.params.ploidy) % 2) != 0 and int(snakemake.params.ploidy) == int(cnv[cns_header.index('cn')])
                    and cnv[cns_header.index('cn1')]+1 == cnv[cns_header.index('cn2')])):
            cnv_chr = cnv[cns_header.index('chromosome')]
            cnv_start = int(cnv[cns_header.index('start')])
            cnv_end = int(cnv[cns_header.index('end')])
            cnv_baf = float_or_na(cnv[cns_header.index('baf')])

            # Translate genomic coordinate to cytogen coordinates
            cyto_coord = ['', '']
            for chr_band in chr_bands:
                if chr_band[0] == cnv_chr:
                    if (cnv_start >= int(chr_band[1]) and cnv_start <= int(chr_band[2])):
                        cyto_coord[0] = chr_band[3]
                    if (cnv_end >= int(chr_band[1]) and cnv_end <= int(chr_band[2])):
                        cyto_coord[1] = chr_band[3]
            if cyto_coord[0] == cyto_coord[1]:
                cyto_coordinates = cnv_chr[3:]+cyto_coord[0]
            else:
                cyto_coordinates = cnv_chr[3:]+cyto_coord[0]+'-'+cyto_coord[1]

            if (cnv_end - cnv_start) >= 100000:
                outline = [cyto_coordinates, cnv_chr, cnv_start, cnv_end, float(cnv[cns_header.index('log2')]), cnv_baf,
                           cnv[cns_header.index('cn')], cnv[cns_header.index('cn1')], cnv[cns_header.index('cn2')],
                           cnv[cns_header.index('depth')], cnv[cns_header.index('probes')], cnv[cns_header.index('weight')]]
                relevant_cnvs[cnv_chr].append(outline)
                continue
            else:
                for bedline in bedtable:
                    if (cnv_chr == bedline[bed_header.index('chr')]):
                        bed_start = int(bedline[bed_header.index('start')])
                        bed_end = int(bedline[bed_header.index('end')])
                        if ((cnv_start >= bed_start and cnv_end <= bed_end) or
                           (cnv_start >= bed_start and cnv_start <= bed_end) or
                           (cnv_end >= bed_start and cnv_end <= bed_end) or
                           (cnv_start <= bed_start and cnv_end >= bed_end)):
                            outline = [cyto_coordinates, cnv_chr, cnv_start, cnv_end, float(cnv[cns_header.index('log2')]),
                                       cnv_baf, cnv[cns_header.index('cn')], cnv[cns_header.index('cn1')],
                                       cnv[cns_header.index('cn2')], cnv[cns_header.index('depth')],
                                       cnv[cns_header.index('probes')], cnv[cns_header.index('weight')]]
                            relevant_cnvs[cnv_chr].append(outline)
                            break

''' Creating xlsx file '''
low_log = float(snakemake.params.log.split(',')[0])
high_log = float(snakemake.params.log.split(',')[1])
sample = str(snakemake.input.cns).split("/")[-1].split("_")[0]
workbook = xlsxwriter.Workbook(snakemake.output[0])
heading_format = workbook.add_format({'bold': True, 'font_size': 18})
tablehead_format = workbook.add_format({'bold': True, 'text_wrap': True})
red_format = workbook.add_format({'font_color': 'red', 'italic': True})
worksheet_whole = workbook.add_worksheet("Whole genome")
worksheet_whole.set_column('B:E', 10)
worksheet_whole.write('A1', 'CNVkit calls', heading_format)
worksheet_whole.write('A3', 'Sample: ' + str(sample))
worksheet_whole.write('A4', 'Tumor content: ' + str(snakemake.params.tc))
worksheet_whole.write('A6', 'Calls larger then 100kb or in CNA bedfile included')
worksheet_whole.write('A7', 'CNV calls with log2 values inbetween ' + str(low_log) + ' and ' + str(high_log) +
                      ' are to be considered uncertain.', red_format)
image_path = snakemake.params.cnvkit_scatter_folder + '/' + sample + '_T'+'.png'
worksheet_whole.insert_image('A9', image_path)
worksheet_whole.write('A31', 'CNA bedfile: '+snakemake.input.gene_interest)
# create sheets for each chromosome
for chromosome in chromosomes:
    worksheet = workbook.add_worksheet(chromosome)
    worksheet.set_column('C:F', 10)
    worksheet.write('A1', 'CNVkit calls for '+chromosome, heading_format)
    worksheet.write('A3', 'Sample: '+str(sample))
    worksheet.write('A5', 'Calls larger then 100kb or in CNA bedfile included')
    worksheet.write('A6', 'CNV calls with log2 values inbetween ' + str(low_log) + ' and ' + str(high_log) +
                    ' are to be considered uncertain.', red_format)
    image_path = snakemake.params.cnvkit_scatter_folder + '/' + sample + '_T_'+chromosome+'.png'
    worksheet.insert_image('A8', image_path)
    worksheet.write_row('A30', relevant_cnvs_header, tablehead_format)
    row = 30
    col = 0
    for line in relevant_cnvs[chromosome]:
        if (low_log < float(line[4]) < high_log):
            worksheet.write_row(row, col, line, red_format)
        else:
            worksheet.write_row(row, col, line)
        row += 1

workbook.close()
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
import yaml
import numpy as np
import sys


def comment_the_config_keys(config_dict):

    commented_config = '\n'.join(
        ['# ' + line for line in yaml.dump(config_dict).rstrip('\n').split('\n')])

    return commented_config


def get_sex_check_df(sex_check_file_path):
    sex_check_df = pd.read_csv(sex_check_file_path)
    sex_check_df["sex_check_test"] = np.where(~sex_check_df.error,
                                              'Pass', 'Fail')  # report peddy  error check as pass/fail for simplicity
    sex_check_df.rename(columns={'sample_id': 'Sample'}, inplace=True)

    return sex_check_df


def write_peddy_mqc(peddy_df, peddy_config, outfile):

    with open(outfile, 'w') as outfile:
        print(comment_the_config_keys(peddy_config), file=outfile)

        peddy_df.to_csv(outfile, sep='\t', mode='a', index=False)


def main():

    try:

        config = snakemake.config.get("peddy", '').get("config", '')

        with open(config, 'r') as report_configs:
            peddy_mqc_configs = yaml.load(report_configs, Loader=yaml.FullLoader)

            sex_check_df = get_sex_check_df(snakemake.input.peddy_sex_check)
            peddy_sex_config = peddy_mqc_configs.get('peddy_sex_check')
            write_peddy_mqc(sex_check_df, peddy_sex_config, snakemake.output.sex_check_mqc)

    except FileNotFoundError:
        sys.exit('Path to peddy config file not found/specified in config.yaml')


if __name__ == '__main__':
    main()
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
__author__ = "Martin Rippin"
__copyright__ = "Copyright 2021, Martin Rippin"
__email__ = "[email protected]"
__license__ = "GPL-3"


import logging
import pysam
import re
import sys
import gzip


# identify caller software from input path
def getCaller(path: str):
    pathParts = path.split("/")
    if len(pathParts) == 3:
        return pathParts[1]
    else:
        raise ValueError(
            "{} is not a valid input for this script. Required:"
            "'snv_indels/caller/sample_type.vcf'.".format(path)
        )


# modify vcf header if necessary
def modifyHeader(caller: str, header: pysam.libcbcf.VariantHeader):
    if (caller == "pisces" or caller == "gatk_mutect2" or
       caller == "pbrun_deepvariant" or caller == "pbrun_mutectcaller_t"):
        header.info.add("AF", "A", "Float", "DescriptionDescription")
    elif caller == "varscan":
        header.info.add(
            "AF",
            "A",
            "Float",
            (
                "Allel count divided on depth"
                "(Quality of bases: Phred score >= 15)"
            )
        )
    return header


# fix af field in freebayes vcf entries
def fixFreebayes(
        header: pysam.libcbcf.VariantHeader,
        row: pysam.libcbcf.VariantRecord
):
    sample = header.samples[0]
    ads = row.samples[sample].get("AD")
    af = []
    for ad in ads:
        af.append(ad/sum(ads))
    return tuple(af[1:])


# loop through input vcf and write modified entries to new vcf
def writeNewVcf(
        path: str,
        header: pysam.libcbcf.VariantHeader,
        vcf: pysam.libcbcf.VariantFile,
        caller: str
):
    new_vcf = pysam.VariantFile(path, "w", header=header)
    for row in vcf.fetch():
        if caller == "freebayes":
            row.info["AF"] = fixFreebayes(header, row)
        elif caller == "haplotypecaller":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "gatk_mutect2":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "pisces":
            row.info["AF"] = row.samples[0].get("VF")
        elif caller == "vardict":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "pbrun_mutectcaller_t":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "pbrun_deepvariant":
            row.info["AF"] = row.samples[0].get("VAF")
        elif caller == "gatk_select_variants_final":
            row.info["AF"] = row.samples[0].get("AF")
        elif caller == "varscan":
            row.info["AF"] = row.samples[0].get("AD")/row.samples[0].get("DP")
        else:
            raise ValueError(
                "{} is not a valid caller for this script. Choose between: "
                "freebayes, haplotypecaller, gatk_mutect2, gatk_select_variants_final, "
                "pbrun_deepvariant, pisces, vardict, varscan.".format(caller)
            )
        new_vcf.write(row)
    return


# call function
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, filename=snakemake.log[0])
    logging.info("Read %s", snakemake.input.vcf)
    vcf = pysam.VariantFile(snakemake.input.vcf)
    logging.info("Determine caller...")
    caller = getCaller(snakemake.input.vcf)
    logging.info("Caller is %s", caller)
    logging.info("Add info to header if necessary")
    header = modifyHeader(caller, vcf.header)
    logging.info("Start writing to %s", snakemake.output.vcf)
    writeNewVcf(snakemake.output.vcf, header, vcf, caller)
    logging.info(
        "Successfully written vcf file %s",
        snakemake.output.vcf,
    )
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import csv
import sys
import vcf

input_file = snakemake.input.vcf
output_file_dels = snakemake.output.dels
output_file_dup = snakemake.output.dups
output_file_ins = snakemake.output.ins


datei = vcf.Reader(open(input_file, "r"))

with open(output_file_dels, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION1", "POSITION2", "LENGTH", "MANTAID", "GENES", "ANNOTATIONINFO", "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in datei:
        if "MantaDEL" in row.ID and not any(xxx in ["MinQUAL", "MinGQ", "MinSomaticScore",
                                                    "Ploidy", "MaxDepth", "MaxMQ0Frac", "NoPairSupport", "SampleFT", "HomRef"]
                                            for xxx in row.FILTER):
            genes = row.INFO["ANN"][0].split("|")
            dellength = row.INFO["SVLEN"]
            pos2 = row.INFO["END"]
            manta_id = ":".join(row.ID.split(":")[0:2])
            # get frequency of paired and split alternate reads
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            if dellength[0] <= -100:
                tsv_writer.writerow([row.CHROM + ":" + str(row.POS), row.CHROM + ":" + str(pos2),
                                    str(dellength)[1:-1], manta_id, genes[3] + "(" + genes[4] + ")",
                                    row.FILTER, str(pr_formatted), str(sr_formatted)])


datei = vcf.Reader(open(input_file, "r"))

with open(output_file_dup, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION1", "POSITION2", "LENGTH", "MANTAID", "GENES", "HOMLENGTH", "HOMSEQ", "ANNOTATIONINFO",
                         "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in datei:
        if "MantaDUP" in row.ID and not any(xxx in ["MinQUAL", "MinGQ", "MinSomaticScore", "Ploidy",
                                                    "MaxDepth", "MaxMQ0Frac", "NoPairSupport", "SampleFT",
                                                    "HomRef"] for xxx in row.FILTER):
            genes = row.INFO["ANN"][0].split("|")
            dellength = row.INFO["SVLEN"]
            pos2 = row.INFO["END"]
            manta_id = ":".join(row.ID.split(":")[0:3])
            try:
                homlen = row.INFO["HOMLEN"]
                homseq = row.INFO["HOMSEQ"]
            except KeyError:
                homlen = "NA"
                homseq = "NA"
            # get frequency of paired and split alternate reads
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            if dellength[0] >= 100:
                tsv_writer.writerow([row.CHROM + ":" + str(row.POS), row.CHROM + ":" + str(pos2), str(dellength)[1:-1],
                                    manta_id, genes[3] + "(" + genes[4] + ")", str(homlen)[1:-1], str(homseq)[1:-1],
                                    row.FILTER, str(pr_formatted), str(sr_formatted)])


datei = vcf.Reader(open(input_file, "r"))

with open(output_file_ins, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION", "REFERENCE", "ALTERNATIVE", "LENGTH", "MANTAID", "GENES",
                         "HOMLENGTH", "HOMSEQ", "ANNOTATIONINFO", "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in datei:
        if "MantaINS" in row.ID and not any(xxx in ["MinQUAL", "MinGQ", "MinSomaticScore", "Ploidy",
                                                    "MaxDepth", "MaxMQ0Frac", "NoPairSupport", "SampleFT", "HomRef"]
                                            for xxx in row.FILTER):
            try:
                genes = row.INFO["ANN"][0].split("|")
            except KeyError:
                genes = ["NA", "NA", "NA", "NA", "NA"]
            try:
                dellength = row.INFO["SVLEN"]
            except KeyError:
                dellength = "NA"
            try:
                homlen = row.INFO["HOMLEN"]
                homseq = row.INFO["HOMSEQ"]
            except KeyError:
                homlen = "NA"
                homseq = "NA"
            manta_id = ":".join(row.ID.split(":")[0:2])
            # get frequency of paired and split alternate reads
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            if dellength == "NA" or dellength[0] >= 100:
                tsv_writer.writerow([row.CHROM + ":" + str(row.POS), row.REF, str(row.ALT)[1:-1], str(dellength)[1:-1],
                                     manta_id, genes[3] + "(" + genes[4] + ")", str(homlen)[1:-1], str(homseq)[1:-1],
                                     row.FILTER, str(pr_formatted), str(sr_formatted)])
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import csv
import sys
import vcf

input_file = snakemake.input.vcf
output_file = snakemake.output.tsv

vcf = vcf.Reader(open(input_file, "r"))

with open(output_file, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#POSITION1", "MANTAID", "BREAKEND", "GENES", "DEPTH", "ANNOTATIONINFO", "PR_ALT_FREQ", "SR_ALT_FREQ"])
    for row in vcf:
        if "MantaBND" in row.ID and not any(x in ["MinQUAL", "MinGQ", "MinSomaticScore",
                                                  "Ploidy", "MaxDepth", "MaxMQ0Frac", "NoPairSupport",
                                                  "SampleFT", "HomRef"] for x in row.FILTER):
            try:
                genes = row.INFO["ANN"][0].split("|")
            except KeyError:
                genes = ["NA", "NA", "NA", "NA", "NA"]
            manta_id = ":".join(row.ID.split(":")[0:2])
            last_sample_index = len(row.samples) - 1
            last_sample = row.samples[last_sample_index]
            pr_values = last_sample.data.PR if hasattr(last_sample.data, 'PR') else None
            sr_values = last_sample.data.SR if hasattr(last_sample.data, 'SR') else None
            if pr_values:
                pr_denominator, pr_numerator = pr_values
                pr_frequency = pr_numerator / (pr_denominator + pr_numerator) if pr_denominator + pr_numerator != 0 else None
            else:
                pr_frequency = None

            if sr_values:
                sr_denominator, sr_numerator = sr_values
                sr_frequency = sr_numerator / (sr_denominator + sr_numerator) if sr_denominator + sr_numerator != 0 else None
            else:
                sr_frequency = None

            # Format frequencies only if they are not None
            pr_formatted = f"{pr_frequency:.4f}" if pr_frequency is not None else 'NA'
            sr_formatted = f"{sr_frequency:.4f}" if sr_frequency is not None else 'NA'
            tsv_writer.writerow([row.CHROM + ":" + str(row.POS), manta_id, str(row.ALT)[1:-1],
                                genes[3] + "(" + genes[4] + ")", row.INFO["BND_DEPTH"],
                                row.FILTER, str(pr_formatted), str(sr_formatted)])
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
from collections import defaultdict
from collections.abc import Generator
from dataclasses import dataclass
import json
from pathlib import Path
import pysam
import sys
from typing import Union
import cyvcf2


@dataclass
class CNV:
    caller: str
    chromosome: str
    genes: list
    start: int
    length: int
    type: str
    copy_number: float
    baf: float

    def end(self):
        return self.start + self.length - 1

    def overlaps(self, other):
        return self.chromosome == other.chromosome and (
            # overlaps in the beginning, or self contained in other
            (self.start >= other.start and self.start <= other.end())
            or
            # overlaps at the end, or self contained in other
            (self.end() >= other.start and self.end() <= other.end())
            or
            # other is contained in self
            (other.start >= self.start and other.end() <= self.end())
        )

    def __hash__(self):
        return hash(f"{self.caller}_{self.chromosome}:{self.start}-{self.end()}_{self.copy_number}")


def parse_fai(filename, skip=None):
    with open(filename) as f:
        for line in f:
            chrom, length = line.strip().split()[:2]
            if skip is not None and chrom in skip:
                continue
            yield chrom, int(length)


def parse_annotation_bed(filename, skip=None):
    with open(filename) as f:
        for line in f:
            chrom, start, end, name = line.strip().split()[:4]
            if skip is not None and chrom in skip:
                continue
            yield chrom, int(start), int(end), name


def get_vaf(vcf_filename, skip=None):
    vcf = cyvcf2.VCF(vcf_filename)
    for variant in vcf:
        if skip is not None and variant.CHROM in skip:
            continue
        yield variant.CHROM, variant.POS, variant.INFO.get("AF", None)


def get_cnvs(vcf_filename, skip=None):
    cnvs = defaultdict(lambda: defaultdict(list))
    vcf = pysam.VariantFile(vcf_filename)
    for variant in vcf.fetch():
        if skip is not None and variant.chrom in skip:
            continue
        caller = variant.info.get("CALLER")
        if caller is None:
            raise KeyError("could not find caller information for variant, has the vcf been annotated?")
        genes = variant.info.get("Genes")
        if genes is None:
            continue
        if isinstance(genes, str):
            genes = [genes]
        cnv = CNV(
            caller,
            variant.chrom,
            sorted(genes),
            variant.pos,
            variant.info.get("SVLEN"),
            variant.info.get("SVTYPE"),
            variant.info.get("CORR_CN"),
            variant.info.get("BAF"),
        )
        cnvs[variant.chrom][caller].append(cnv)
    return cnvs


def merge_cnv_dicts(dicts, vaf, annotations, chromosomes, filtered_cnvs, unfiltered_cnvs):
    callers = list(map(lambda x: x["caller"], dicts))
    caller_labels = dict(
        cnvkit="cnvkit",
        gatk="GATK",
    )
    cnvs = {}
    for chrom, chrom_length in chromosomes:
        cnvs[chrom] = dict(
            chromosome=chrom,
            label=chrom,
            length=chrom_length,
            vaf=[],
            annotations=[],
            callers={c: dict(name=c, label=caller_labels.get(c, c), ratios=[], segments=[], cnvs=[]) for c in callers},
        )

    for a in annotations:
        for item in a:
            cnvs[item[0]]["annotations"].append(
                dict(
                    start=item[1],
                    end=item[2],
                    name=item[3],
                )
            )

    if vaf is not None:
        for v in vaf:
            cnvs[v[0]]["vaf"].append(
                dict(
                    pos=v[1],
                    vaf=v[2],
                )
            )

    # Iterate over the unfiltered CNVs and pair them according to overlap.
    for uf_cnvs, f_cnvs in zip(unfiltered_cnvs, filtered_cnvs):
        for chrom, cnvdict in uf_cnvs.items():
            callers = sorted(list(cnvdict.keys()))
            first_caller = callers[0]
            rest_callers = callers[1:]

            # Keep track of added CNVs on a chromosome to avoid duplicates
            added_cnvs = set()

            for cnv1 in cnvdict[first_caller]:
                pass_filter = False

                if cnv1 in f_cnvs[chrom][first_caller]:
                    # The CNV is part of the filtered set, so all overlapping
                    # CNVs should pass the filter.
                    pass_filter = True

                cnv_group = [cnv1]
                for caller2 in rest_callers:
                    for cnv2 in cnvdict[caller2]:
                        if cnv1.overlaps(cnv2):
                            # Add overlapping CNVs from other callers
                            cnv_group.append(cnv2)

                            if cnv2 in f_cnvs[chrom][caller2]:
                                # If the overlapping CNV is part of the filtered
                                # set, the whole group should pass the filter.
                                pass_filter = True

                for c in cnv_group:
                    if c in added_cnvs:
                        continue
                    cnvs[c.chromosome]["callers"][c.caller]["cnvs"].append(
                        dict(
                            genes=c.genes,
                            start=c.start,
                            length=c.length,
                            type=c.type,
                            cn=c.copy_number,
                            baf=c.baf,
                            passed_filter=pass_filter,
                        )
                    )
                    added_cnvs.add(c)

    for d in dicts:
        for r in d["ratios"]:
            cnvs[r["chromosome"]]["callers"][d["caller"]]["ratios"].append(
                dict(
                    start=r["start"],
                    end=r["end"],
                    log2=r["log2"],
                )
            )
        for s in d["segments"]:
            cnvs[s["chromosome"]]["callers"][d["caller"]]["segments"].append(
                dict(
                    start=s["start"],
                    end=s["end"],
                    log2=s["log2"],
                )
            )

    for v in cnvs.values():
        v["callers"] = list(v["callers"].values())

    return list(cnvs.values())


def main():
    log = Path(snakemake.log[0])

    logfile = open(log, "w")
    sys.stdout = sys.stderr = logfile

    annotation_beds = snakemake.input["annotation_bed"]
    fasta_index_file = snakemake.input["fai"]
    germline_vcf = snakemake.input["germline_vcf"]
    json_files = snakemake.input["json"]
    filtered_cnv_vcf_files = snakemake.input["filtered_cnv_vcfs"]
    cnv_vcf_files = snakemake.input["cnv_vcfs"]

    if len(germline_vcf) == 0:
        germline_vcf = None

    output_file = snakemake.output["json"]

    skip_chromosomes = snakemake.params["skip_chromosomes"]

    cnv_dicts = []
    for fname in json_files:
        with open(fname) as f:
            cnv_dicts.append(json.load(f))

    fai = parse_fai(fasta_index_file, skip_chromosomes)
    vaf = None
    if germline_vcf is not None:
        vaf = get_vaf(germline_vcf, skip_chromosomes)
    annotations = []
    for filename in annotation_beds:
        annotations.append(parse_annotation_bed(filename, skip_chromosomes))

    filtered_cnv_vcfs = []
    unfiltered_cnv_vcfs = []
    for f_vcf, uf_vcf in zip(filtered_cnv_vcf_files, cnv_vcf_files):
        filtered_cnv_vcfs.append(get_cnvs(f_vcf, skip_chromosomes))
        unfiltered_cnv_vcfs.append(get_cnvs(uf_vcf, skip_chromosomes))

    cnvs = merge_cnv_dicts(cnv_dicts, vaf, annotations, fai, filtered_cnv_vcfs, unfiltered_cnv_vcfs)

    with open(output_file, "w") as f:
        print(json.dumps(cnvs), file=f)


if __name__ == "__main__":
    main()
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import csv
import sys
import vcf

input_file = snakemake.input.vcf
analysis = snakemake.params.analysis
sample = snakemake.params.sample
output_file = snakemake.output.tsv

vcf = vcf.Reader(open(input_file, "r"))

with open(output_file, "wt") as tsv:
    tsv_writer = csv.writer(tsv, delimiter='\t')
    tsv_writer.writerow(["#SAMPLE", "CHROMOSOME", "POSITION", "REFERENCE", "ALTERNATIVE", "ALLELEFREQUENCY", "DEPTH", "GENE",
                         "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", "PROTEINSEQ", "CONSEQUENCE"])
    for row in vcf:
        genes = row.INFO["CSQ"][0].split("|")
        transcript_field = genes[10].split(":")
        if len(transcript_field) == 1:
            transcript_field = ["", ""]
        protein_field = genes[11].split(":")
        if len(protein_field) == 1:
            protein_field = [genes[24], ""]
        i = 0
        if analysis == "tn":
            i = 1
        af = row.samples[i]["AF"]
        tsv_writer.writerow([sample, row.CHROM, row.POS, row.REF, row.ALT, af, row.INFO["DP"], genes[3], transcript_field[0],
                             transcript_field[1], protein_field[0], protein_field[1], genes[1]])
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
import sys


input_file = snakemake.input[0]


with open(input_file, 'r') as samplesheet:
    next(samplesheet)
    for lline in samplesheet:
        line = lline.strip().split("\t")
        if line[3] == "M":
            sex = "1"
        elif line[3] == "K":
            sex = "2"
        else:
            sex = "0"
        with open("qc/peddy/" + line[0] + ".peddy.fam", 'w+') as pedfile:
            pedfile.write("\t".join([line[0], line[0] + "_T", "0", "0", sex, "-9"])+"\n")
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import sys
import csv


header_row = "Sample_ID,Sample_Name,Description,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project\n"
header_row_split = header_row.strip().split(",")

samples = {}
header = False
with open(snakemake.input.sample_sheet, 'r') as samplesheet:
    for lline in samplesheet:
        if len(lline.split()) == 0:  # skip blank lines
            continue
        if header:
            line = lline.strip().split(",")
            if line[7] == "WGSWP2":
                samples[line[0]] = {
                                        header_row_split[1]: line[1],
                                        "Type": line[2].split("_")[0],
                                        "Sex": line[2].split("_")[1],
                                        "Pedegree_id": line[2].split("_")[2],
                                        "Seq_run": line[2].split("_")[3],
                                        "Project": line[2].split("_")[4],
                                        header_row_split[3]: line[3],
                                        header_row_split[4]: line[4],
                                        header_row_split[5]: line[5],
                                        header_row_split[6]: line[6],
                                        header_row_split[7]: line[7],
                                    }
        if lline == header_row:
            header = True


if len(samples) == 0:
    raise Exception("No samples found, has the header in SampleSheet changed?")

# replacement files for rna and dna samples seperate to get sample order correct
with open(snakemake.output.replacement_dna, "w+") as replacement_tsv_dna, \
     open(snakemake.output.replacement_rna, "w+") as replacement_tsv_rna, \
     open(snakemake.output.order_dna, "w+") as order_tsv_dna, \
     open(snakemake.output.order_rna, "w+") as order_tsv_rna, \
     open(snakemake.output.dnanumber, "w+") as dna_table, \
     open(snakemake.output.rnanumber, "w+") as rna_table:
    order_tsv_dna.write("\t".join(["Sample Order", "Pedegree ID", "DNA number"])+"\n")
    order_tsv_rna.write("\t".join(["Sample Order", "Pedegree ID", "RNA number"])+"\n")
    dna_table.write("\t".join(["Sample", "ped_id", "dna_number"])+"\n")
    rna_table.write("\t".join(["Sample", "ped_id", "rna_number"])+"\n")
    i = 1
    j = 1
    for sample in samples.values():
        sample["Type"] = "R" if sample["Type"] == "Heltranskriptom" else sample["Type"]
        replacement_line = [sample["Pedegree_id"] + "_" + sample["Type"]]
        order_line = [sample["Pedegree_id"] + "_" + sample["Type"], sample["Sample_Name"]]
        if sample["Type"] == "T" or sample["Type"] == "N":
            replacement_tsv_dna.write("\t".join(replacement_line + ["sample_"+str(f"{i:03}")]) + "\n")
            order_tsv_dna.write("\t".join(["sample_"+str(f"{i:03}")] + order_line) + "\n")
            dna_table.write("\t".join(["sample_"+str(f"{i:03}")] + order_line)+"\n")
            i += 1
        elif sample["Type"] == "R":
            replacement_tsv_rna.write("\t".join(replacement_line + ["sample_"+str(f"{j:03}")]) + "\n")
            order_tsv_rna.write("\t".join(["sample_"+str(f"{j:03}")] + order_line) + "\n")
            rna_table.write("\t".join(["sample_"+str(f"{j:03}")] + order_line) + "\n")
            j += 1
ShowHide 22 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/clinical-genomics-uppsala/fluffy_hematology_wgs
Name: fluffy_hematology_wgs
Version: 1.0.1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: GNU General Public License v3.0
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...