A Snakemake workflow for benchmarking variant calling approaches with Genome in a Bottle (GIAB), CHM (syndip), or other custom datasets


A Snakemake workflow for benchmarking variant calling approaches with Genome in a Bottle (GIAB) data (and other custom benchmark datasets). The workflow uses a combination of bedtools, mosdepth, rtg-tools, pandas and datavzrd.

Usage

The usage of this workflow is described in the Snakemake Workflow Catalog.
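
Deployment typically follows the catalog's standard recipe; the sketch below is a minimal example (target directory, tag, and core count are placeholders — check the catalog entry for the authoritative commands):

snakedeploy deploy-workflow https://github.com/snakemake-workflows/benchmark-giab benchmark-giab --tag v1.8.3
cd benchmark-giab
# adjust config/config.yaml to your callsets, then run:
snakemake --cores all --use-conda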

If you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this (original) repository and its DOI (see above).

Code Snippets

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

## Extract arguments
extra = snakemake.params.get("extra", "")

shell("bcftools index {extra} {snakemake.input[0]} {log}")
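
A minimal sketch of a rule that could invoke this wrapper; the rule name and paths are hypothetical, and the version is one of those referenced by this workflow:

rule bcftools_index:
    input:
        "calls/{sample}.bcf",
    output:
        "calls/{sample}.bcf.csi",  # bcftools index writes a .csi next to the input by default
    log:
        "logs/bcftools-index/{sample}.log",
    params:
        extra="",  # optional flags for bcftools index
    wrapper:
        "v1.7.2/bio/bcftools/index"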
__author__ = "William Rowell"
__copyright__ = "Copyright 2020, William Rowell"
__email__ = "[email protected]"
__license__ = "MIT"

import sys
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

bed = snakemake.input.get("bed", "")
by = snakemake.params.get("by", "")
if by:
    if bed:
        sys.exit(
            "Either provide a bed input file OR a window size via params.by, not both."
        )
    else:
        by = f"--by {by}"
if bed:
    by = f"--by {bed}"

quantize_out = False
thresholds_out = False
regions_bed_out = False
region_dist_out = False
for file in snakemake.output:
    if ".per-base." in file and "--no-per-base" in extra:
        sys.exit(
            "You asked not to generate per-base output (--no-per-base), but your rule specifies a '.per-base.' output file. Remove one of the two."
        )
    if ".quantized.bed.gz" in file:
        quantize_out = True
    if ".thresholds.bed.gz" in file:
        thresholds_out = True
    if ".mosdepth.region.dist.txt" in file:
        region_dist_out = True
    if ".regions.bed.gz" in file:
        regions_bed_out = True


if by and not regions_bed_out:
    sys.exit(
        "You asked for by-region output. Please also specify *.regions.bed.gz as a rule output."
    )

if by and not region_dist_out:
    sys.exit(
        "You asked for by-region output. Please also specify *.mosdepth.region.dist.txt as a rule output."
    )

if (region_dist_out or regions_bed_out) and not by:
    sys.exit(
        "You specified *.regions.bed.gz and/or *.mosdepth.region.dist.txt as a rule output. Please also request by-region output via 'input.bed' or 'params.by'."
    )

quantize = snakemake.params.get("quantize", "")
if quantize:
    if not quantize_out:
        sys.exit(
            "You asked for quantized output via params.quantize. Please also specify *.quantized.bed.gz as a rule output."
        )
    quantize = f"--quantize {quantize}"

if not quantize and quantize_out:
    sys.exit(
        "The rule has output *.quantized.bed.gz specified. Please also specify params.quantize to actually generate it."
    )


thresholds = snakemake.params.get("thresholds", "")
if thresholds:
    if not thresholds_out:
        sys.exit(
            "You asked for --thresholds output via params.thresholds. Please also specify *.thresholds.bed.gz as a rule output."
        )
    thresholds = f"--thresholds {thresholds}"

if not thresholds and thresholds_out:
    sys.exit(
        "The rule has output *.thresholds.bed.gz specified. Please also specify params.thresholds to actually generate it."
    )


precision = snakemake.params.get("precision", "")
if precision:
    precision = f"MOSDEPTH_PRECISION={precision}"


fasta = snakemake.input.get("fasta", "")
if fasta:
    fasta = f"--fasta {fasta}"


# mosdepth takes additional threads through its option --threads
# One thread for mosdepth
# Other threads are *additional* decompression threads passed to the '--threads' argument
threads = "" if snakemake.threads <= 1 else "--threads {}".format(snakemake.threads - 1)


# named output summary = "*.mosdepth.summary.txt" is required
prefix = snakemake.output.summary.replace(".mosdepth.summary.txt", "")


shell(
    "({precision} mosdepth {threads} {fasta} {by} {quantize} {thresholds} {extra} {prefix} {snakemake.input.bam}) {log}"
)
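
The wrapper above requires a named bam input and a named summary output (from which it derives the output prefix). A minimal sketch of a matching rule, with hypothetical paths and the wrapper version this workflow references:

rule mosdepth:
    input:
        bam="mapped/{sample}.bam",
        bai="mapped/{sample}.bam.bai",
    output:
        "coverage/{sample}.mosdepth.global.dist.txt",
        summary="coverage/{sample}.mosdepth.summary.txt",  # required named output, defines the prefix
    log:
        "logs/mosdepth/{sample}.log",
    params:
        extra="--no-per-base",  # consistent with declaring no '.per-base.' output
    threads: 4
    wrapper:
        "v1.7.2/bio/mosdepth"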
__author__ = "Johannes Köster, Christopher Schröder"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

log = snakemake.log_fmt_shell()

extra = snakemake.params.get("extra", "")
java_opts = get_java_opts(snakemake)

bams = snakemake.input.bams
if isinstance(bams, str):
    bams = [bams]
bams = list(map("--INPUT {}".format, bams))

if snakemake.output.bam.endswith(".cram"):
    output = "/dev/stdout"
    if snakemake.params.embed_ref:
        view_options = "-O cram,embed_ref"
    else:
        view_options = "-O cram"
    convert = f" | samtools view -@ {snakemake.threads} {view_options} --reference {snakemake.input.ref} -o {snakemake.output.bam}"
else:
    output = snakemake.output.bam
    convert = ""

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(picard MarkDuplicates"  # Tool and its subcommand
        " {java_opts}"  # Automatic java options
        " {extra}"  # User-defined parameters
        " {bams}"  # Input bam(s)
        " --TMP_DIR {tmpdir}"
        " --OUTPUT {output}"  # Output bam
        " --METRICS_FILE {snakemake.output.metrics}"  # Output metrics
        " {convert} ) {log}"  # Logging
    )
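
A sketch of a rule using this wrapper with plain BAM output, so the CRAM branch above is not taken (names and memory value are hypothetical):

rule mark_duplicates:
    input:
        bams="mapped/{sample}.bam",
    output:
        bam="dedup/{sample}.bam",
        metrics="dedup/{sample}.metrics.txt",
    log:
        "logs/picard/dedup/{sample}.log",
    resources:
        mem_mb=4096,  # picked up by get_java_opts to size the JVM heap
    wrapper:
        "v1.7.2/bio/picard/markduplicates"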
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
import sys
from itertools import product
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome and datatype != "dna":
    raise ValueError(
        "invalid datatype, to select a single chromosome the datatype must be dna"
    )

spec = spec.format(build=build, release=release)
url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether the above URL is currently available (this might be a temporary server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
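
A sketch of a download rule for this wrapper; the species/build/release values are examples only:

rule get_genome:
    output:
        "resources/genome.fasta",
    log:
        "logs/get-genome.log",
    params:
        species="homo_sapiens",
        datatype="dna",
        build="GRCh38",
        release="107",
    wrapper:
        "v1.7.2/bio/reference/ensembl-sequence"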
__author__ = "Michael Chambers"
__copyright__ = "Copyright 2019, Michael Chambers"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(
    snakemake, parse_threads=False, parse_write_index=False, parse_output_format=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("samtools faidx {samtools_opts} {extra} {snakemake.input[0]} {log}")
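
A minimal matching rule (paths hypothetical):

rule genome_faidx:
    input:
        "resources/genome.fasta",
    output:
        "resources/genome.fasta.fai",
    log:
        "logs/genome-faidx.log",
    wrapper:
        "v1.7.2/bio/samtools/faidx"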
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@
# One thread for samtools index
# Other threads are *additional* threads passed to the '-@' argument
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}"
)
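
A minimal matching rule; note that threads beyond the first become samtools' additional '-@' threads, as handled above (paths hypothetical):

rule samtools_index:
    input:
        "dedup/{sample}.bam",
    output:
        "dedup/{sample}.bam.bai",
    log:
        "logs/samtools-index/{sample}.log",
    threads: 4
    wrapper:
        "v1.7.2/bio/samtools/index"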
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2016, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"

from os.path import splitext

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if len(snakemake.input) == 0:
    raise ValueError("A reference genome has to be provided!")
elif len(snakemake.input) > 1:
    raise ValueError("Only one reference genome can be provided!")

# Prefix that should be used for the database
prefix = snakemake.params.get("prefix", splitext(snakemake.output.idx[0])[0])

if len(prefix) > 0:
    prefix = "-p " + prefix

# Construction algorithm that will be used to build the database, default is bwtsw
construction_algorithm = snakemake.params.get("algorithm", "")

if len(construction_algorithm) != 0:
    construction_algorithm = "-a " + construction_algorithm

shell(
    "bwa index {prefix} {construction_algorithm} {snakemake.input[0]} {log}"
)
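
A sketch of a matching rule; the named idx output list is what the wrapper derives its prefix from (paths hypothetical):

rule bwa_index:
    input:
        "resources/genome.fasta",
    output:
        idx=multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
    log:
        "logs/bwa-index.log",
    params:
        algorithm="bwtsw",
    wrapper:
        "v1.8.0/bio/bwa/index"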
__author__ = "Johannes Köster, Julian de Ruiter"
__copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"


from os import path
import re
import sys
import tempfile
from snakemake.shell import shell


# Extract arguments.
extra = snakemake.params.get("extra", "")

sort = snakemake.params.get("sorting", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")

index = snakemake.input.idx
if isinstance(index, str):
    index = path.splitext(snakemake.input.idx)[0]
else:
    index = path.splitext(snakemake.input.idx[0])[0]


if re.search(r"-T\b", sort_extra) or re.search(r"--TMP_DIR\b", sort_extra):
    sys.exit(
        "You have specified a temp dir (`-T` or `--TMP_DIR`) in params.sort_extra; this is automatically set to a temporary directory."
    )

log = snakemake.log_fmt_shell(stdout=False, stderr=True)


# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
    1,
    2,
}:
    raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")

if sort_order not in {"coordinate", "queryname"}:
    raise ValueError("Unexpected value for sort_order ({})".format(sort_order))

# Determine which pipe command to use for converting to bam or sorting.
if sort == "none":

    # Simply convert to bam using samtools view.
    pipe_cmd = "samtools view -Sbh -o {snakemake.output[0]} -"

elif sort == "samtools":

    # Add name flag if needed.
    if sort_order == "queryname":
        sort_extra += " -n"

    # Sort alignments using samtools sort.
    pipe_cmd = "samtools sort -T {tmp} {sort_extra} -o {snakemake.output[0]} -"

elif sort == "picard":

    # Sort alignments using picard SortSam.
    pipe_cmd = (
        "picard SortSam {sort_extra} --INPUT /dev/stdin"
        " --OUTPUT {snakemake.output[0]} --SORT_ORDER {sort_order} --TMP_DIR {tmp}"
    )

else:
    raise ValueError("Unexpected value for params.sort ({})".format(sort))

with tempfile.TemporaryDirectory() as tmp:
    shell(
        "(bwa mem"
        " -t {snakemake.threads}"
        " {extra}"
        " {index}"
        " {snakemake.input.reads}"
        " | " + pipe_cmd + ") {log}"
    )
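
A sketch of a paired-end mapping rule with samtools coordinate sorting (read group and paths are hypothetical):

rule bwa_mem:
    input:
        reads=["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"],
        idx=multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
    output:
        "mapped/{sample}.bam",
    log:
        "logs/bwa-mem/{sample}.log",
    params:
        extra=r"-R '@RG\tID:{sample}\tSM:{sample}'",
        sorting="samtools",
        sort_order="coordinate",
    threads: 8
    wrapper:
        "v1.8.0/bio/bwa/mem"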
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(
    snakemake, parse_ref=False, parse_output_format=False, parse_memory=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


if "--tbi" in extra or "--csi" in extra:
    raise ValueError(
        "You have specified the index format (`--tbi/--csi`) in `params.extra`; this is automatically inferred from the first output file."
    )

if snakemake.output[0].endswith(".tbi"):
    extra += " --tbi"
elif snakemake.output[0].endswith(".csi"):
    extra += " --csi"
else:
    raise ValueError("invalid index file format (must be '.tbi' or '.csi').")


shell("bcftools index {bcftools_opts} {extra} {snakemake.input[0]} {log}")
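
Unlike the older variant above, this version infers the index format from the output suffix; a minimal sketch with hypothetical paths:

rule index_calls:
    input:
        "calls/{sample}.vcf.gz",
    output:
        "calls/{sample}.vcf.gz.tbi",  # the .tbi suffix selects --tbi automatically
    log:
        "logs/index-calls/{sample}.log",
    wrapper:
        "v1.9.0/bio/bcftools/index"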
__author__ = "Dayne Filer"
__copyright__ = "Copyright 2019, Dayne Filer"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts

bcftools_opts = get_bcftools_opts(snakemake, parse_memory=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


shell("bcftools norm {bcftools_opts} {extra} {snakemake.input[0]} {log}")
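
A sketch of a normalization rule; the input layout (positional calls plus a named ref handled by get_bcftools_opts) and the extra flag are assumptions, not taken from this workflow:

rule normalize_calls:
    input:
        "calls/{sample}.vcf.gz",
        ref="resources/genome.fasta",
    output:
        "calls/{sample}.norm.bcf",  # output format inferred from the .bcf suffix by the wrapper utils
    log:
        "logs/norm/{sample}.log",
    params:
        extra="-m-",  # example: split multiallelic records into biallelics
    wrapper:
        "v1.9.0/bio/bcftools/norm"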
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2017, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")

shell("datavzrd {snakemake.input.config} {extra} --output {snakemake.output[0]} {log}")
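
A sketch of a report rule for this wrapper; the config and table paths are hypothetical, and marking the output directory as a Snakemake report is optional:

rule datavzrd_report:
    input:
        config="resources/datavzrd/report.yaml",
        table="results/precision-recall.tsv",
    output:
        report(directory("results/datavzrd-report"), htmlindex="index.html"),
    log:
        "logs/datavzrd.log",
    wrapper:
        "v2.1.1/utils/datavzrd"

The snippets that follow are shortened rule bodies taken from the workflow's own rule files.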
shell:
    "(set +o pipefail; samtools view -f3 -h"
    " {params.bam_url}"
    " {params.limit} |"
    " samtools sort -n -O BAM --threads {resources.sort_threads} | "
    " samtools fastq -1 {output.r1} -2 {output.r2} -s /dev/null -0 /dev/null -) 2> {log}"
shell:
    "(mkdir -p {output}; curl -L {params.url} | tar -x -C {output} --strip-components 1) 2> {log}"
shell:
    "ls {input.archive}; (bcftools view {params.url}"
    " | sed {params.repl_chr} | bcftools view -Ob - > {output}"
    ") 2> {log}"
shell:
    "bcftools concat -O b --allow-overlap {input} > {output} 2> {log}"
wrapper:
    "v1.9.0/bio/bcftools/norm"
shell:
    "({params.cmd} | sed {params.repl_chr} > {output}) 2> {log}"
shell:
    "curl --insecure -L http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz > {output} 2> {log}"
shell:
    "({params.get_bed} {params.liftover}) 2> {log}"
shell:
    "sed {params.repl_chr} {input} > {output} 2> {log}"
wrapper:
    "v1.7.2/bio/reference/ensembl-sequence"
wrapper:
    "v1.7.2/bio/samtools/faidx"
wrapper:
    "v1.8.0/bio/bwa/index"
wrapper:
    "v1.8.0/bio/bwa/mem"
wrapper:
    "v1.7.2/bio/picard/markduplicates"
wrapper:
    "v1.7.2/bio/samtools/index"
wrapper:
    "v1.7.2/bio/mosdepth"
shell:
    "bcftools annotate {input.calls} --rename-chrs {input.repl_file} "
    "-Ob -o {output} 2> {log}"
shell:
    "(bedtools intersect -b {input.regions} -a "
    "<(bcftools view {input.variants}) -wa -f 1.0 -header | "
    "bcftools view -Oz > {output}) 2> {log}"
wrapper:
    "v1.7.2/bio/bcftools/index"
# from rules/eval.smk, line 60
script:
    "../scripts/stat-truth.py"
# from rules/eval.smk, line 73
shell:
    "rtg format --output {output} {input.genome} &> {log}"
shell:
    "rm -r {params.output}; rtg vcfeval --threads {threads} --ref-overlap --all-records "
    "--output-mode ga4gh --baseline {input.truth} --calls {input.query} "
    "--output {params.output} --template {input.genome} &> {log}"
script:
    "../scripts/calc-precision-recall.py"
# from rules/eval.smk, line 124
script:
    "../scripts/collect-stratifications.py"
# from rules/eval.smk, line 143
script:
    "../scripts/collect-precision-recall.py"
# from rules/eval.smk, line 159
wrapper:
    "v2.1.1/utils/datavzrd"
# from rules/eval.smk, line 190
script:
    "../scripts/extract-fp-fn.py"
# from rules/eval.smk, line 204
script:
    "../scripts/collect-fp-fn.py"
# from rules/eval.smk, line 231
wrapper:
    "v2.1.1/utils/datavzrd"
# from rules/eval.smk, line 265
wrapper:
    "v1.9.0/bio/bcftools/index"
wrapper:
    "v1.9.0/bio/bcftools/index"
from collections import defaultdict
import sys, os

# sys.path.insert(0, os.path.dirname(__file__))
sys.stderr = open(snakemake.log[0], "w")

from abc import ABC, abstractmethod
from enum import Enum
import pandas as pd
import pysam

from common.classification import CompareExactGenotype, CompareExistence, Class


class Classifications:
    def __init__(self, comparator):
        self.tp_query = 0
        self.tp_truth = 0
        self.fn = 0
        self.fp = 0
        self.comparator = comparator

    def register(self, record):
        for c in self.comparator.classify(record):
            if c.cls is Class.TP_truth:
                self.tp_truth += 1
            elif c.cls is Class.TP_query:
                self.tp_query += 1
            elif c.cls is Class.FN:
                self.fn += 1
            elif c.cls is Class.FP:
                self.fp += 1
            else:
                assert False, "unexpected case"

    def precision(self):
        p = self.tp_query + self.fp
        if p == 0:
            return 1.0
        return float(self.tp_query) / float(p)

    def recall(self):
        t = self.tp_truth + self.fn
        if t == 0:
            return 1.0
        return float(self.tp_truth) / float(t)


def collect_results(vartype):
    classifications_exact = Classifications(CompareExactGenotype(vartype))
    classifications_existence = Classifications(CompareExistence(vartype))

    for record in pysam.VariantFile(snakemake.input.calls):
        classifications_exact.register(record)
        classifications_existence.register(record)

    vartype = "indels" if vartype == "INDEL" else "snvs"

    mismatched_genotype = (
        classifications_existence.tp_query - classifications_exact.tp_query
    )
    if classifications_existence.tp_query > 0:
        mismatched_genotype_rate = (
            mismatched_genotype / classifications_existence.tp_query
        )
    else:
        mismatched_genotype_rate = 0.0

    d = pd.DataFrame(
        {
            "precision": [classifications_existence.precision()],
            "tp_query": [classifications_existence.tp_query],
            "fp": [classifications_existence.fp],
            "recall": [classifications_existence.recall()],
            "tp_truth": [classifications_existence.tp_truth],
            "fn": [classifications_existence.fn],
            "genotype_mismatch_rate": [mismatched_genotype_rate],
        }
    )
    return d[
        [
            "precision",
            "tp_query",
            "fp",
            "recall",
            "tp_truth",
            "fn",
            "genotype_mismatch_rate",
        ]
    ]


assert snakemake.wildcards.vartype in ["snvs", "indels"]
vartype = "SNV" if snakemake.wildcards.vartype == "snvs" else "INDEL"

collect_results(vartype).to_csv(snakemake.output[0], sep="\t", index=False)
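
To make the counting logic above concrete, here is a hypothetical sanity check of the precision/recall arithmetic (the stub comparator is never invoked; all counts are made up):

class _StubComparator:
    # hypothetical stand-in; classify() is not called in this check
    def classify(self, record):
        return []

c = Classifications(_StubComparator())
c.tp_query, c.fp = 90, 10      # 90 matched query calls, 10 false positives
c.tp_truth, c.fn = 85, 15      # 85 recovered truth variants, 15 missed
assert c.precision() == 0.9    # 90 / (90 + 10)
assert c.recall() == 0.85      # 85 / (85 + 15)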
import os
import sys

sys.stderr = open(snakemake.log[0], "w")

import pandas as pd
from scipy.cluster.hierarchy import ward, leaves_list
from scipy.spatial.distance import pdist
from sklearn.feature_selection import chi2
from statsmodels.stats.multitest import fdrcorrection
import numpy as np


def collect_chromosomes(f):
    d = pd.read_csv(f, sep="\t", dtype=str, usecols=["chromosome"]).drop_duplicates()
    return d["chromosome"].tolist()


def read_data(f, callset, chromosome=None):
    print("reading", f, "...", file=sys.stderr)
    data = pd.read_csv(
        f,
        sep="\t",
        dtype=str,
    )

    if chromosome is not None:
        data = data.loc[data["chromosome"] == chromosome]
    print(data.head(), file=sys.stderr)

    data = data.set_index(
        [
            "chromosome",
            "position",
            "ref_allele",
            "alt_allele",
            "true_genotype",
        ]
    )
    data.drop("class", axis="columns", inplace=True)

    assert (
        not data.index.duplicated().any()
    ), f"bug: not expecting any duplicates in FP/FN table {f}"

    data.columns = [callset]

    if snakemake.wildcards.classification == "fn":
        data.loc[:, callset] = "FN"
    return data


def get_idx_sorted_by_clustering(data):
    cluster_matrix = ward(pdist(~data.isna(), metric="hamming"))
    idx = leaves_list(cluster_matrix)
    return idx


chromosomes = sorted(
    {chrom for f in snakemake.input.tables for chrom in collect_chromosomes(f)}
)

if not chromosomes:
    chromosomes = [None]

n_written = 0

# process data for each chromosome separately and append to the same files
for i, chromosome in enumerate(chromosomes):
    if n_written > snakemake.params.max_entries:
        break

    data = pd.concat(
        [
            read_data(f, callset, chromosome)
            for f, callset in zip(snakemake.input.tables, snakemake.params.callsets)
        ],
        axis="columns",
    )

    data = data.loc[data.isna().sum(axis="columns").sort_values().index,]

    data = data.dropna(how="all")

    if not data.empty:
        idx_rows = get_idx_sorted_by_clustering(data)
        idx_cols = get_idx_sorted_by_clustering(data.T)
        data = data.iloc[idx_rows, idx_cols]

    label_df = pd.DataFrame(snakemake.params.labels)

    def store(data, output, label_idx=None):
        _label_df = label_df
        if label_idx is not None:
            _label_df = label_df.iloc[[label_idx]]

        # add labels
        index_cols = data.index.names
        cols = data.columns
        data = pd.concat([_label_df, data.reset_index()]).set_index(index_cols)
        # restore column order
        data = data[cols]

        if i == 0:
            mode = "w"
            header = True
        else:
            mode = "a"
            header = False
        data.to_csv(output, sep="\t", mode=mode, header=header)

    store(data, snakemake.output.main)
    n_written += len(data)

    label_df.index = snakemake.params.label_names

    os.makedirs(snakemake.output.dependency_sorting, exist_ok=True)

    # for each label, calculate mutual information and sort FP/FN entries in descending order
    for label_idx, label in enumerate(snakemake.params.label_names):
        outdata = data
        if not data.empty:
            # oe = OrdinalEncoder()
            # le = LabelEncoder()
            # feature matrix: genotypes, transposed into samples x features, replace NA with False (match)
            # and any genotype with True (mismatch with truth).
            feature_matrix = data.reset_index(drop=True).T.copy()
            feature_matrix[~pd.isna(feature_matrix)] = True
            feature_matrix[pd.isna(feature_matrix)] = False

            # target vector: label values, converted into factors
            target_vector = label_df.loc[label]

            # ignore any NA in the target vector and correspondingly remove the rows in the feature matrix
            not_na_target_vector = target_vector[~pd.isna(target_vector)]
            feature_matrix = feature_matrix.loc[not_na_target_vector.index]

            # perform chi² test against each feature
            _, pvals = chi2(feature_matrix, not_na_target_vector)
            sorted_idx = np.argsort(pvals)

            _, fdr = fdrcorrection(
                pvals, method="negcorr"
            )  # use Benjamini/Yekutieli as variants might be both positively or negatively correlated

            # clone data
            sorted_data = data.copy(deep=True)

            # sort by label
            sorted_target_vector = target_vector.sort_values()
            sorted_data = sorted_data[sorted_target_vector.index]

            # add pvalue and FDR
            sorted_data.insert(0, "FDR dependency", np.around(fdr, 3))
            sorted_data.insert(0, "p-value dependency", np.around(pvals, 3))

            outdata = sorted_data.iloc[sorted_idx]

            # only keep the rather significant entries (but be a bit more permissive than 0.05)
            outdata = outdata.loc[outdata["p-value dependency"] <= 0.25]

        outpath = os.path.join(snakemake.output.dependency_sorting, f"{label}.tsv")
        store(outdata, outpath, label_idx=label_idx)
import sys
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def load_data(path, callset):
    d = pd.read_csv(path, sep="\t")
    d.insert(0, "callset", callset)
    return d


results = pd.concat(
    [
        load_data(f, callset)
        for f, callset in zip(snakemake.input.tables, snakemake.params.callsets)
    ],
    axis="rows",
)

def cov_key(cov_label):
    # return lower bound as integer for sorting
    if ".." in cov_label:
        return int(cov_label.split("..")[0])
    else:
        return int(cov_label[1:])


def sort_key(col):
    if col.name == "callset":
        return col
    else:
        return col.apply(cov_key)


results.sort_values(["callset", "coverage"], inplace=True, key=sort_key)
results["sort_index"] = results["coverage"].apply(cov_key)

results.to_csv(snakemake.output[0], sep="\t", index=False)
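
For illustration, cov_key handles the two coverage label shapes like this (values are hypothetical):

assert cov_key("10..20") == 10  # closed range: sort by its lower bound
assert cov_key("≥30") == 30     # open-ended top bin: strip the leading '≥'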
import sys

sys.stderr = open(snakemake.log[0], "w")

import pandas as pd


def get_cov_label(coverage):
    lower = snakemake.params.coverage_lower_bounds[coverage]
    bounds = [
        bound
        for bound in snakemake.params.coverage_lower_bounds.values()
        if bound > lower
    ]
    if bounds:
        upper = min(bounds)
        return f"{lower}..{upper}"
    else:
        return f"≥{lower}"


def load_data(f, coverage):
    d = pd.read_csv(f, sep="\t")
    d.insert(0, "coverage", get_cov_label(coverage))
    return d


if snakemake.input:
    report = pd.concat(
        load_data(f, cov) for cov, f in zip(snakemake.params.coverages, snakemake.input)
    )

    if (report["tp_truth"] == 0).all():
        raise ValueError(
            f"The callset {snakemake.wildcards.callset} does not predict any variant from the truth. "
            "This is likely a technical issue in the callset and should be checked before further evaluation."
        )

    report.to_csv(snakemake.output[0], sep="\t", index=False)
else:
    pd.DataFrame(
        {
            col: []
            for col in [
                "coverage",
                "precision",
                "tp_query",
                "fp",
                "recall",
                "tp_truth",
                "fn",
                "genotype_mismatch_rate",
            ]
        }
    ).to_csv(snakemake.output[0], sep="\t")
from collections import defaultdict
import sys, os

# sys.path.insert(0, os.path.dirname(__file__))
sys.stderr = open(snakemake.log[0], "w")

import csv
import pysam

from common.classification import CompareExistence, Class, is_het

cmp = CompareExistence()
varfile = pysam.VariantFile(snakemake.input.calls)

with open(snakemake.output[0], "w", newline="") as outfile:
    writer = csv.writer(outfile, delimiter="\t")
    writer.writerow(
        [
            "class",
            "chromosome",
            "position",
            "ref_allele",
            "alt_allele",
            "true_genotype",
            "predicted_genotype",
        ]
    )
    for record in varfile:
        for c in cmp.classify(record):
            if c.cls == Class.FP and snakemake.wildcards.classification == "fp":
                classification = "FP"
                truth_gt = "0/0"
                query_gt = "0/1" if is_het(record, 1, c.variant) else "1/1"
            elif c.cls == Class.FN and snakemake.wildcards.classification == "fn":
                classification = "FN"
                truth_gt = "0/1" if is_het(record, 0, c.variant) else "1/1"
                query_gt = ""
            else:
                continue

            for alt in c.variant.alts:
                writer.writerow(
                    [
                        classification,
                        record.contig,
                        record.pos,
                        record.ref,
                        alt,
                        truth_gt,
                        query_gt,
                    ]
                )
import sys

sys.stderr = open(snakemake.log[0], "w")

import json
from pysam import VariantFile


with VariantFile(snakemake.input[0]) as infile, open(
    snakemake.output[0], "w"
) as outfile:
    json.dump({"isempty": not any(infile)}, outfile)

Name: benchmark-giab
Version: v1.8.3
URL: https://github.com/snakemake-workflows/benchmark-giab
License: MIT